<html>
 <head>
  <meta charset="utf-8"/>
  <!-- Responsive viewport. Zoom limits are intentionally not set so users can pinch-zoom to enlarge text. -->
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <title>
   Machine Learning With Spark Note 3：构建分类器  | 数螺 | NAUT IDEA
  </title>
  <link href="http://cdn.bootcss.com/bootstrap/3.3.6/css/bootstrap-theme.min.css" rel="stylesheet"/>
  <link href="http://cdn.bootcss.com/bootstrap/3.3.6/css/bootstrap.min.css" rel="stylesheet"/>
  <style type="text/css">
   /* ---- Article body (#xmain) ---- */

   /* Keep embedded images within the column width and space them vertically. */
   #xmain img {
     display: block;
     max-width: 100%;
     margin-top: 10px;
     margin-bottom: 10px;
   }

   /* Body paragraphs. */
   #xmain p {
     margin-top: 20px;
     font-size: 16px;
     line-height: 150%;
   }

   /* Heading scale used inside the article body. */
   #xmain h2 { font-size: 24px; }
   #xmain h3 { font-size: 20px; }
   #xmain h4 { font-size: 18px; }

   /* ---- Site banner (.header) ---- */

   /* Blue bar with white text, separated from the content below it. */
   .header {
     margin-bottom: 20px;
     color: #ffffff;
     background-color: #0099ff;
   }

   /* Banner text sits inline next to the logo, vertically centred. */
   .header p {
     display: inline-block;
     margin: 0px;
     padding: 10px 0;
     font-size: 16px;
     vertical-align: middle;
   }

   .header a {
     color: white;
   }

   .header img {
     height: 25px;
   }
  </style>
  <script src="http://cdn.bootcss.com/jquery/3.0.0/jquery.min.js">
  </script>
  <script src="http://nautstatic-10007657.file.myqcloud.com/static/css/readability.min.js" type="text/javascript">
  </script>
  <script type="text/javascript">
   // Once the DOM is ready, run Readability over a clone of this page and
   // replace the contents of #xmain with the extracted article markup.
   $(document).ready(function () {
     // Location of the original article, passed to Readability as its first
     // argument (presumably used to resolve relative URLs; confirm against
     // the bundled readability.min.js).
     var uri = {
       spec: "http://dataunion.org/22574.html",
       host: "http://dataunion.org",
       prePath: "http://dataunion.org",
       scheme: "http",
       pathBase: "http://dataunion.org/"
     };

     // Parse a clone rather than the live document, so that any DOM changes
     // Readability makes while extracting do not affect the rendered page.
     var documentClone = document.cloneNode(true);
     var article = new Readability(uri, documentClone).parse();

     // parse() may return null when no article can be extracted. In that case
     // keep the server-rendered markup instead of throwing on article.content.
     var target = document.getElementById("xmain");
     if (target && article && article.content) {
       target.innerHTML = article.content;
     }
   });
  </script>
  <!-- 1466457402: Accept with keywords: (title(0.5):分类器,Spark,数盟,社区, topn(0.666666666667):社区,概率,数盟,分类器,行业资讯,垃圾邮件,数据挖掘,语文,变量,Python,模型,职业规划,参数,文章,基础架构,Spark,原始数据,向量,函数,性能,算法,spark,编程语言,数据,代码,分类,标准化,特征,课程,决策树).-->
 </head>
 <body onload="">
  <!-- Site banner: the logo and site name share a single link to /databee; the tagline column carries the hidden-xs class. -->
  <div class="header">
   <div class="container">
    <div class="row">
     <div class="col-xs-6 col-sm-6 text-left">
      <!-- One link instead of two adjacent links to the same target; the logo is decorative because the site name follows it. -->
      <a href="/databee">
       <img alt="" src="http://nautidea-10007657.cos.myqcloud.com/logo_white.png"/>
       <p>
        数螺
       </p>
      </a>
     </div>
     <div class="hidden-xs col-sm-6 text-right">
      <p>
       致力于数据科学的推广和知识传播
      </p>
     </div>
    </div>
   </div>
  </div>
  <div class="container text-center">
   <h1>
    Machine Learning With Spark Note 3：构建分类器
   </h1>
  </div>
  <div class="container" id="xmain">
   ﻿﻿
   <title>
    Machine Learning With Spark Note 3：构建分类器 | 数盟社区
   </title>
   <!-- All in One SEO Pack 2.2.7.6.2 by Michael Torbert of Semper Fi Web Design[32,86] -->
   <!-- /all in one seo pack -->
   <!--
<div align="center">
<a href="http://strata.oreilly.com.cn/hadoop-big-data-cn?cmp=mp-data-confreg-home-stcn16_dataunion_pc" target="_blank"><img src="http://dataunion.org/wp-content/uploads/2016/05/stratabj.jpg"/ ></a>
</div>
-->
   <header id="header-web">
    <div class="header-main">
     <!-- Source-site logo linking to the dataunion.org home page; alt gives the image link a text alternative. -->
     <hgroup class="logo">
      <h1>
       <a href="http://dataunion.org/" rel="home" title="数盟社区">
        <img alt="数盟社区" src="http://dataunion.org/wp-content/themes/yzipi/images/logo.png"/>
       </a>
      </h1>
     </hgroup>
     <!--logo-->
     <nav class="header-nav">
      <ul class="menu" id="menu-%e4%b8%bb%e8%8f%9c%e5%8d%95">
       <li class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-has-children menu-item-71" id="menu-item-71">
        <a href="http://dataunion.org/category/events" title="events">
         活动
        </a>
        <ul class="sub-menu">
         <li class="menu-item menu-item-type-post_type menu-item-object-page menu-item-22457" id="menu-item-22457">
          <a href="http://dataunion.org/2016timeline">
           2016档期
          </a>
         </li>
         <li class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-22459" id="menu-item-22459">
          <a href="http://dataunion.org/category/parterc">
           合作会议
          </a>
         </li>
        </ul>
       </li>
       <li class="menu-item menu-item-type-taxonomy menu-item-object-category current-post-ancestor menu-item-has-children menu-item-20869" id="menu-item-20869">
        <a href="http://dataunion.org/category/tech" title="articles">
         文章
        </a>
        <ul class="sub-menu">
         <li class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-20867" id="menu-item-20867">
          <a href="http://dataunion.org/category/tech/base" title="base">
           基础架构
          </a>
         </li>
         <li class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-3302" id="menu-item-3302">
          <a href="http://dataunion.org/category/tech/ai" title="ai">
           人工智能
          </a>
         </li>
         <li class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-3303" id="menu-item-3303">
          <a href="http://dataunion.org/category/tech/analysis" title="analysis">
           数据分析
          </a>
         </li>
         <li class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-21920" id="menu-item-21920">
          <a href="http://dataunion.org/category/tech/dm">
           数据挖掘
          </a>
         </li>
         <li class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-3314" id="menu-item-3314">
          <a href="http://dataunion.org/category/tech/viz" title="viz">
           可视化
          </a>
         </li>
         <li class="menu-item menu-item-type-taxonomy menu-item-object-category current-post-ancestor current-menu-parent current-post-parent menu-item-3305" id="menu-item-3305">
          <a href="http://dataunion.org/category/tech/devl" title="devl">
           编程语言
          </a>
         </li>
        </ul>
       </li>
       <li class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-has-children menu-item-20876" id="menu-item-20876">
        <a href="http://dataunion.org/category/industry">
         行业
        </a>
        <ul class="sub-menu">
         <li class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-16328" id="menu-item-16328">
          <a href="http://dataunion.org/category/industry/case" title="case">
           行业应用
          </a>
         </li>
         <li class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-2112" id="menu-item-2112">
          <a href="http://dataunion.org/category/industry/demo" title="demo">
           Demo展示
          </a>
         </li>
         <li class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-21562" id="menu-item-21562">
          <a href="http://dataunion.org/category/industry/news">
           行业资讯
          </a>
         </li>
        </ul>
       </li>
       <li class="menu-item menu-item-type-taxonomy menu-item-object-category current-post-ancestor current-menu-parent current-post-parent menu-item-311" id="menu-item-311">
        <a href="http://dataunion.org/category/sources" title="sources">
         资源
        </a>
       </li>
       <li class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-20870" id="menu-item-20870">
        <a href="http://dataunion.org/category/books" title="book">
         图书
        </a>
       </li>
       <li class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-21363" id="menu-item-21363">
        <a href="http://dataunion.org/category/training">
         课程
        </a>
       </li>
       <li class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-has-children menu-item-21853" id="menu-item-21853">
        <a href="http://dataunion.org/category/jobs">
         职位
        </a>
        <ul class="sub-menu">
         <li class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-22050" id="menu-item-22050">
          <a href="http://dataunion.org/category/career">
           职业规划
          </a>
         </li>
        </ul>
       </li>
      </ul>
     </nav>
     <!--header-nav-->
    </div>
   </header>
   <!--header-web-->
   <div id="main">
    <div id="soutab">
     <form action="http://dataunion.org/" class="search" method="get">
     </form>
    </div>
    <div id="container">
     <!-- Breadcrumb trail (home > category > current post). Labelled so it can be told apart from the other nav landmarks on the page. -->
     <nav aria-label="面包屑导航" id="mbx">
      当前位置：
      <a href="http://dataunion.org">
       首页
      </a>
      &gt;
      <a href="http://dataunion.org/category/sources">
       资源
      </a>
      &gt;  正文
     </nav>
     <!--mbx-->
     <article class="content">
      <!-- Article title block: headline linking to the post URL, followed by the view count and category links. -->
      <header class="contenttitle">
       <div class="mscc">
        <h1 class="mscctitle">
         <a href="http://dataunion.org/22574.html">
          Machine Learning With Spark Note 3：构建分类器
         </a>
        </h1>
        <address class="msccaddress ">
         <em>
          2,294 次阅读 -
         </em>
         <a href="http://dataunion.org/category/sources" rel="category tag">
          资源
         </a>
         ,
         <a href="http://dataunion.org/category/tech/devl" rel="category tag">
          编程语言
         </a>
        </address>
       </div>
      </header>
      <div class="content-text">
       <p>
        本文为数盟特约作者投稿，欢迎转载，请注明出处“数盟社区”和作者
       </p>
       <p>
        博主简介：段石石，1号店精准化推荐算法工程师，主要负责1号店用户画像构建，喜欢钻研点Machine Learning的黑科技，对Deep Learning感兴趣，喜欢玩kaggle、看9神，对数据和Machine Learning有兴趣咱们可以一起聊聊，个人博客：
        <!-- Opens in a new tab; rel="noopener" keeps the opened page from reaching back via window.opener. -->
        <a href="http://hacker.duanshishi.com/" rel="noopener" target="_blank">
         hacker.duanshishi.com
        </a>
       </p>
       <p>
       </p>
       <h2 id="wiz_toc_0">
        Spark构建分类器
       </h2>
       <p>
        在本章中，我们会了解基本的分类器及其在Spark中的使用方法，以及一套对model进行评价和调参的方法。MLlib在这一块还是比较强大的，但是对比sklearn，无论是算法种类还是配套功能都有很大的差距。不过，据传spark最近正在修改ml，参考sklearn中的pipeline框架，将所有对数据的操作写成一个管道，这样model的选择、调参、评估将像sklearn一样更加方便。下面是Kaggle比赛当中的一些代码，用一个Pipeline把数据流的所有操作集合在一起，这样就可以很方便地进行调参。
       </p>
       <!-- Crayon Syntax Highlighter v_2.7.2_beta -->
       <div class="crayon-syntax crayon-theme-classic crayon-font-monaco crayon-os-pc print-yes notranslate" data-settings=" minimize scroll-mouseover" id="crayon-57685d3677e0d558836815" style=" margin-top: 12px; margin-bottom: 12px; font-size: 12px !important; line-height: 15px !important;">
        <div class="crayon-toolbar" data-settings=" mouseover overlay hide delay" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
         <span class="crayon-title">
         </span>
         <div class="crayon-tools" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
          <div class="crayon-button crayon-nums-button" title="切换是否显示行编号">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-plain-button" title="纯文本显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-wrap-button" title="切换自动换行">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-expand-button" title="点击展开代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-copy-button" title="复制代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-popup-button" title="在新窗口中显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <span class="crayon-language">
           Python
          </span>
         </div>
        </div>
        <div class="crayon-info" style="min-height: 16.8px !important; line-height: 16.8px !important;">
        </div>
        <div class="crayon-plain-wrap">
         <textarea class="crayon-plain print-no" data-settings="dblclick" readonly="" style="-moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4; font-size: 12px !important; line-height: 15px !important;" wrap="soft">
          clf = pipeline.Pipeline([
('union', FeatureUnion(
        transformer_list = [
            ('cst',  cust_regression_vals()),
            ('txt1', pipeline.Pipeline([('s1', cust_txt_col(key='search_term')), ('tfidf1', tfidf)])),
            ('txt2', pipeline.Pipeline([('s2', cust_txt_col(key='product_title')), ('tfidf2', tfidf), ('tsvd2', tsvd)])),
            ('txt3', pipeline.Pipeline([('s3', cust_txt_col(key='product_description')), ('tfidf3', tfidf), ('tsvd3', tsvd)])),
            ('txt4', pipeline.Pipeline([('s4', cust_txt_col(key='brand')), ('tfidf4', tfidf), ('tsvd4', tsvd)]))
        ],
        transformer_weights = {
            'cst': 1.0,
            'txt1': 0.5,
            'txt2': 0.25,
            'txt3': 0.0,
            'txt4': 0.5
        },
        n_jobs = 1
)),
('xgbr', xgbr)])
         </textarea>
        </div>
        <div class="crayon-main" style="">
         <table class="crayon-table">
          <tbody>
           <tr class="crayon-row">
            <td class="crayon-nums " data-settings="show">
             <div class="crayon-nums-content" style="font-size: 12px !important; line-height: 15px !important;">
              <div class="crayon-num" data-line="crayon-57685d3677e0d558836815-1">
               1
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e0d558836815-2">
               2
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e0d558836815-3">
               3
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e0d558836815-4">
               4
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e0d558836815-5">
               5
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e0d558836815-6">
               6
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e0d558836815-7">
               7
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e0d558836815-8">
               8
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e0d558836815-9">
               9
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e0d558836815-10">
               10
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e0d558836815-11">
               11
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e0d558836815-12">
               12
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e0d558836815-13">
               13
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e0d558836815-14">
               14
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e0d558836815-15">
               15
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e0d558836815-16">
               16
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e0d558836815-17">
               17
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e0d558836815-18">
               18
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e0d558836815-19">
               19
              </div>
             </div>
            </td>
            <td class="crayon-code">
             <div class="crayon-pre" style="font-size: 12px !important; line-height: 15px !important; -moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4;">
              <div class="crayon-line" id="crayon-57685d3677e0d558836815-1">
               <span class="crayon-v">
                clf
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                pipeline
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                Pipeline
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                [
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e0d558836815-2">
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-s">
                'union'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                FeatureUnion
               </span>
               <span class="crayon-sy">
                (
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e0d558836815-3">
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                transformer_list
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                [
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e0d558836815-4">
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-s">
                'cst'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                cust_regression_vals
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                ,
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e0d558836815-5">
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-s">
                'txt1'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                pipeline
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                Pipeline
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-s">
                's1'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                cust_txt_col
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                key
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-s">
                'search_term'
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-s">
                'tfidf1'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                tfidf
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                ,
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e0d558836815-6">
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-s">
                'txt2'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                pipeline
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                Pipeline
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-s">
                's2'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                cust_txt_col
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                key
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-s">
                'product_title'
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-s">
                'tfidf2'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                tfidf
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-s">
                'tsvd2'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                tsvd
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                ,
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e0d558836815-7">
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-s">
                'txt3'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                pipeline
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                Pipeline
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-s">
                's3'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                cust_txt_col
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                key
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-s">
                'product_description'
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-s">
                'tfidf3'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                tfidf
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-s">
                'tsvd3'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                tsvd
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                ,
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e0d558836815-8">
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-s">
                'txt4'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                pipeline
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                Pipeline
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-s">
                's4'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                cust_txt_col
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                key
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-s">
                'brand'
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-s">
                'tfidf4'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                tfidf
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-s">
                'tsvd4'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                tsvd
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e0d558836815-9">
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-sy">
                ,
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e0d558836815-10">
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                transformer_weights
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                {
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e0d558836815-11">
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                'cst'
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                1.0
               </span>
               <span class="crayon-sy">
                ,
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e0d558836815-12">
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                'txt1'
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                0.5
               </span>
               <span class="crayon-sy">
                ,
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e0d558836815-13">
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                'txt2'
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                0.25
               </span>
               <span class="crayon-sy">
                ,
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e0d558836815-14">
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                'txt3'
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                0.0
               </span>
               <span class="crayon-sy">
                ,
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e0d558836815-15">
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                'txt4'
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                0.5
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e0d558836815-16">
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                }
               </span>
               <span class="crayon-sy">
                ,
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e0d558836815-17">
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                n_jobs
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                1
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e0d558836815-18">
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                ,
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e0d558836815-19">
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-s">
                'xgbr'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                xgbr
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
             </div>
            </td>
           </tr>
          </tbody>
         </table>
        </div>
       </div>
       <!-- [Format Time: 0.0481 seconds] -->
       <p>
       </p>
       <p>
        下面我们将分为以下几部分来聊下Spark MLlib中的分类器模块：
        <br/>
        <img src="http://7xr3b9.com1.z0.glb.clouddn.com/mac_blogs_classifier_spark1.png"/>
       </p>
       <ul>
        <li>
         了解MLlib中支持的基本的分类器算法
        </li>
        <li>
         利用Spark从原始数据当中提取特征
        </li>
        <li>
         利用MLlib训练各种有代表性的模型
        </li>
        <li>
         使用训练好的模型对数据进行预测
        </li>
        <li>
         使用标准的评估手段对分类器模型来进行评估
        </li>
        <li>
         使用一些数据处理的方法来提升model性能
        </li>
        <li>
         探索在Spark MLlib中如何进行Hyperparameter tuning，以及如何使用交叉验证（CV）来选择最优参数
        </li>
       </ul>
       <h3 id="wiz_toc_1">
        MLlib中支持的分类器算法
       </h3>
       <h4 id="wiz_toc_2">
        Linear models
       </h4>
       <p>
        线性模型，顾名思义，就是在特征空间中确定一条直线（高维时为超平面）来分割数据，从而对数据进行分类判断。基本的model如下：
       </p>
       <div>
        <img src="http://7xr3b9.com1.z0.glb.clouddn.com/mac_blogs_linermodel_spark1.png"/>
       </div>
       <p>
        其中，y是目标变量，w是model的权重向量，x是输入的特征向量。这里我们可以变化f来更改model。
        <br/>
        f确定后，一般会有对应的cost函数。然后，我们在权重向量的参数空间寻优，找到使cost函数值最小的一组最优参数。常用的cost函数包括logistic loss（logistic regression）、hinge loss（Linear Support Vector Machine）以及最常见的Zero-one loss：
       </p>
       <div>
        <img src="http://7xr3b9.com1.z0.glb.clouddn.com/mac_blogs_linearmodel_loss_function.png"/>
       </div>
       <h5 id="wiz_toc_3">
        Logistic regression
       </h5>
       <p>
        在Logistic Regression中，f就是所谓的sigmoid函数：
       </p>
       <div>
        <img alt="sigmoid函数公式" src="http://7xr3b9.com1.z0.glb.clouddn.com/mac_blogs_linearmodel_sigmoid.png"/>
       </div>
       <h5 id="wiz_toc_4">
        Linear Support Vector Machines
       </h5>
       <p>
        在线性支持向量机中，f就是一个恒等函数（identity function，即输出等于输入），也就是本身：
       </p>
       <div>
        <img src="http://7xr3b9.com1.z0.glb.clouddn.com/mac_blogs_linearmodel_lsvm1.png"/>
       </div>
       <p>
        在Linear Support Vector Machines中，我们使用的cost函数为
        <strong>
         hinge loss
        </strong>
        ：
       </p>
       <div>
        <img alt="hinge loss公式" src="http://7xr3b9.com1.z0.glb.clouddn.com/mac_blogs_linearmodel_lsvm2.png"/>
       </div>
       <p>
        Logistic Regression和Support Vector Machines的分割线示意图：
        <br/>
        <img alt="Logistic Regression和Support Vector Machines的分割线示意图" src="http://7xr3b9.com1.z0.glb.clouddn.com/mac_blogs_linearmodel_lsvm3.png"/>
       </p>
       <h4 id="wiz_toc_5">
        Naive Bayes Model
       </h4>
       <p>
        Naive Bayes要求特征之间条件独立，是一种实际当中应用很多的分类方法
        <br/>
        <img src="http://7xr3b9.com1.z0.glb.clouddn.com/mac_blogs_naive_bayes.png"/>
        <br/>
        即假设在给定类别的条件下，各个特征的概率相互独立；然后对所有类别分别计算概率，选择概率最大的那个C作为我们预测的类别。
        <br/>
        一个简单的二值分类器的结果：
        <br/>
        <img src="http://7xr3b9.com1.z0.glb.clouddn.com/mac_blogs_naive_bayes_result.png"/>
       </p>
       <h4 id="wiz_toc_6">
        Decision trees
       </h4>
       <p>
        决策树的基本原理就是通过某些metrics选出最重要的属性node来对数据进行分割，然后依次进行分割，决策树是一个很流行的算法，也是一种很容易过拟合的算法，为了减少过拟合的产生，有其他ensemble的高级版，如Random Forest、GBDT，用来增强决策树算法的性能和鲁棒性
        <br/>
        <img src="http://7xr3b9.com1.z0.glb.clouddn.com/mac_blogs_DecisionTreeSimpleClassifier.png"/>
        <br/>
        一个简单的决策树
        <br/>
        <img src="http://7xr3b9.com1.z0.glb.clouddn.com/mac_blogs_DecisionTreeSimpleClassifierResult.png"/>
       </p>
       <h3 id="wiz_toc_7">
        从原始数据中提取合适的特征
       </h3>
       <p>
        在Supervised Learning中，MLlib提供了LabeledPoint数据类型，用来同时保存样本的label和特征向量：
       </p>
       <!-- Crayon Syntax Highlighter v_2.7.2_beta -->
       <div class="crayon-syntax crayon-theme-classic crayon-font-monaco crayon-os-pc print-yes notranslate" data-settings=" minimize scroll-mouseover" id="crayon-57685d3677e1f832603667" style=" margin-top: 12px; margin-bottom: 12px; font-size: 12px !important; line-height: 15px !important;">
        <div class="crayon-toolbar" data-settings=" mouseover overlay hide delay" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
         <span class="crayon-title">
         </span>
         <div class="crayon-tools" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
          <div class="crayon-button crayon-nums-button" title="切换是否显示行编号">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-plain-button" title="纯文本显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-wrap-button" title="切换自动换行">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-expand-button" title="点击展开代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-copy-button" title="复制代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-popup-button" title="在新窗口中显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <span class="crayon-language">
           Scala
          </span>
         </div>
        </div>
        <div class="crayon-info" style="min-height: 16.8px !important; line-height: 16.8px !important;">
        </div>
        <div class="crayon-plain-wrap">
         <textarea class="crayon-plain print-no" data-settings="dblclick" readonly="" style="-moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4; font-size: 12px !important; line-height: 15px !important;" wrap="soft">
          case class LabeledPoint(label: Double, features: Vector)
         </textarea>
        </div>
        <div class="crayon-main" style="">
         <table class="crayon-table">
          <tbody>
           <tr class="crayon-row">
            <td class="crayon-nums " data-settings="show">
             <div class="crayon-nums-content" style="font-size: 12px !important; line-height: 15px !important;">
              <div class="crayon-num" data-line="crayon-57685d3677e1f832603667-1">
               1
              </div>
             </div>
            </td>
            <td class="crayon-code">
             <div class="crayon-pre" style="font-size: 12px !important; line-height: 15px !important; -moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4;">
              <div class="crayon-line" id="crayon-57685d3677e1f832603667-1">
               <span class="crayon-e">
                case
               </span>
               <span class="crayon-t">
                class
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                LabeledPoint
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                label
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                Double
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                features
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                Vector
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
             </div>
            </td>
           </tr>
          </tbody>
         </table>
        </div>
       </div>
       <!-- [Format Time: 0.0013 seconds] -->
       <p>
       </p>
       <h4 id="wiz_toc_8">
        从Kaggle StumbleUpon evergreen Dataset提取features
       </h4>
       <p>
       </p>
       <!-- Crayon Syntax Highlighter v_2.7.2_beta -->
       <div class="crayon-syntax crayon-theme-classic crayon-font-monaco crayon-os-pc print-yes notranslate" data-settings=" minimize scroll-mouseover" id="crayon-57685d3677e26127778219" style=" margin-top: 12px; margin-bottom: 12px; font-size: 12px !important; line-height: 15px !important;">
        <div class="crayon-toolbar" data-settings=" mouseover overlay hide delay" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
         <span class="crayon-title">
         </span>
         <div class="crayon-tools" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
          <div class="crayon-button crayon-nums-button" title="切换是否显示行编号">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-plain-button" title="纯文本显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-wrap-button" title="切换自动换行">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-expand-button" title="点击展开代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-copy-button" title="复制代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-popup-button" title="在新窗口中显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <span class="crayon-language">
           Python
          </span>
         </div>
        </div>
        <div class="crayon-info" style="min-height: 16.8px !important; line-height: 16.8px !important;">
        </div>
        <div class="crayon-plain-wrap">
         <textarea class="crayon-plain print-no" data-settings="dblclick" readonly="" style="-moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4; font-size: 12px !important; line-height: 15px !important;" wrap="soft">
          # 去掉train中的header信息
!sed 1d ../data/evergreen_classification/train.tsv &gt; ../data/evergreen_classification/train_noheader.tsv
# 读入数据，以\t分割
rawData = sc.textFile('../data/evergreen_classification/train_noheader.tsv')
records = rawData.map(lambda x : x.split('\t'))
records.take(4)
         </textarea>
        </div>
        <div class="crayon-main" style="">
         <table class="crayon-table">
          <tbody>
           <tr class="crayon-row">
            <td class="crayon-nums " data-settings="show">
             <div class="crayon-nums-content" style="font-size: 12px !important; line-height: 15px !important;">
              <div class="crayon-num" data-line="crayon-57685d3677e26127778219-1">
               1
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e26127778219-2">
               2
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e26127778219-3">
               3
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e26127778219-4">
               4
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e26127778219-5">
               5
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e26127778219-6">
               6
              </div>
             </div>
            </td>
            <td class="crayon-code">
             <div class="crayon-pre" style="font-size: 12px !important; line-height: 15px !important; -moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4;">
              <div class="crayon-line" id="crayon-57685d3677e26127778219-1">
               <span class="crayon-c">
                # 去掉train中的header信息
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e26127778219-2">
               <span class="crayon-o">
                !
               </span>
               <span class="crayon-i">
                sed
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                1d
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-o">
                /
               </span>
               <span class="crayon-v">
                data
               </span>
               <span class="crayon-o">
                /
               </span>
               <span class="crayon-v">
                evergreen_classification
               </span>
               <span class="crayon-o">
                /
               </span>
               <span class="crayon-v">
                train
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                tsv
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                &gt;
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-o">
                /
               </span>
               <span class="crayon-v">
                data
               </span>
               <span class="crayon-o">
                /
               </span>
               <span class="crayon-v">
                evergreen_classification
               </span>
               <span class="crayon-o">
                /
               </span>
               <span class="crayon-v">
                train_noheader
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                tsv
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e26127778219-3">
               <span class="crayon-c">
                # 读入数据，以\t分割
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e26127778219-4">
               <span class="crayon-v">
                rawData
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                sc
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                textFile
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-s">
                '../data/evergreen_classification/train_noheader.tsv'
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e26127778219-5">
               <span class="crayon-v">
                records
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                rawData
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                split
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-s">
                '\t'
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e26127778219-6">
               <span class="crayon-v">
                records
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                take
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-cn">
                4
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
             </div>
            </td>
           </tr>
          </tbody>
         </table>
        </div>
       </div>
       <!-- [Format Time: 0.0128 seconds] -->
       <p>
       </p>
       <p>
        数据内容如图：
        <br/>
        <img src="http://7xr3b9.com1.z0.glb.clouddn.com/mac_blogs_kaggle_dataset.png"/>
        <br/>
        取其中有用字段，并做初步处理（将？取代为0.0）
       </p>
       <!-- Crayon Syntax Highlighter v_2.7.2_beta -->
       <div class="crayon-syntax crayon-theme-classic crayon-font-monaco crayon-os-pc print-yes notranslate" data-settings=" minimize scroll-mouseover" id="crayon-57685d3677e2d897254649" style=" margin-top: 12px; margin-bottom: 12px; font-size: 12px !important; line-height: 15px !important;">
        <div class="crayon-toolbar" data-settings=" mouseover overlay hide delay" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
         <span class="crayon-title">
         </span>
         <div class="crayon-tools" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
          <div class="crayon-button crayon-nums-button" title="切换是否显示行编号">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-plain-button" title="纯文本显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-wrap-button" title="切换自动换行">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-expand-button" title="点击展开代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-copy-button" title="复制代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-popup-button" title="在新窗口中显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <span class="crayon-language">
           Python
          </span>
         </div>
        </div>
        <div class="crayon-info" style="min-height: 16.8px !important; line-height: 16.8px !important;">
        </div>
        <div class="crayon-plain-wrap">
         <textarea class="crayon-plain print-no" data-settings="dblclick" readonly="" style="-moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4; font-size: 12px !important; line-height: 15px !important;" wrap="soft">
          from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import Vectors
trimmed = records.map(lambda x: [xx.replace('\\',' ') for xx in x])
# data.first()
label = trimmed.map(lambda x : x[-1])
# label.take(5)
# features =  trimmed.map(lambda x: x[4:-1]).map(lambda x: [ 0.0 if x=='?' else float(xx.replace("\"","")) for xx in x])
# data = LabeledPoint(label,Vectors.dense(features))
# data = trimmed.map(lambda x:(x[-1],x[4:-1])).map(lambda (x,y): (x,[ 0.0 if yy =='?' else float(yy.replace("\"","")) for yy in y])).map(LabeledPoint(label,features))
# ?号时，文本里面存的是"?"
data = trimmed.map(lambda x:(x[-1],x[4:-1])).map(lambda (x,y): (x.replace("\"","") ,[ 0.0 if yy =='\"?\"' else yy.replace("\"","") for yy in y])).map(lambda (x,y):(int(x),[float(yy) for yy in y])).map(lambda (x,y):LabeledPoint(x,Vectors.dense(y)))
# features.take(5)
data.take(5)
         </textarea>
        </div>
        <div class="crayon-main" style="">
         <table class="crayon-table">
          <tbody>
           <tr class="crayon-row">
            <td class="crayon-nums " data-settings="show">
             <div class="crayon-nums-content" style="font-size: 12px !important; line-height: 15px !important;">
              <div class="crayon-num" data-line="crayon-57685d3677e2d897254649-1">
               1
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e2d897254649-2">
               2
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e2d897254649-3">
               3
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e2d897254649-4">
               4
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e2d897254649-5">
               5
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e2d897254649-6">
               6
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e2d897254649-7">
               7
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e2d897254649-8">
               8
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e2d897254649-9">
               9
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e2d897254649-10">
               10
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e2d897254649-11">
               11
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e2d897254649-12">
               12
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e2d897254649-13">
               13
              </div>
             </div>
            </td>
            <td class="crayon-code">
             <div class="crayon-pre" style="font-size: 12px !important; line-height: 15px !important; -moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4;">
              <div class="crayon-line" id="crayon-57685d3677e2d897254649-1">
               <span class="crayon-st">
                from
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                pyspark
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                mllib
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                regression
               </span>
               <span class="crayon-r">
                import
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                LabeledPoint
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e2d897254649-2">
               <span class="crayon-st">
                from
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                pyspark
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                mllib
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                linalg
               </span>
               <span class="crayon-r">
                import
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                Vectors
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e2d897254649-3">
               <span class="crayon-v">
                trimmed
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                records
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-v">
                xx
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                replace
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-s">
                '\\'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-s">
                ' '
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                for
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                xx
               </span>
               <span class="crayon-st">
                in
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e2d897254649-4">
               <span class="crayon-c">
                # data.first()
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e2d897254649-5">
               <span class="crayon-v">
                label
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                trimmed
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-o">
                -
               </span>
               <span class="crayon-cn">
                1
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e2d897254649-6">
               <span class="crayon-c">
                # label.take(5)
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e2d897254649-7">
               <span class="crayon-c">
                # features =  trimmed.map(lambda x: x[4:-1]).map(lambda x: [ 0.0 if x=='?' else float(xx.replace("\"","")) for xx in x])
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e2d897254649-8">
               <span class="crayon-c">
                # data = LabeledPoint(label,Vectors.dense(features))
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e2d897254649-9">
               <span class="crayon-c">
                # data = trimmed.map(lambda x:(x[-1],x[4:-1])).map(lambda (x,y): (x,[ 0.0 if yy =='?' else float(yy.replace("\"","")) for yy in y])).map(LabeledPoint(label,features))
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e2d897254649-10">
               <span class="crayon-c">
                # ?号时，文本里面存的是"?"
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e2d897254649-11">
               <span class="crayon-v">
                data
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                trimmed
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-o">
                -
               </span>
               <span class="crayon-cn">
                1
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-cn">
                4
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-o">
                -
               </span>
               <span class="crayon-cn">
                1
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                y
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                replace
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-s">
                "\""
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-s">
                ""
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                0.0
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                if
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                yy
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                ==
               </span>
               <span class="crayon-s">
                '\"?\"'
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                else
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                yy
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                replace
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-s">
                "\""
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-s">
                ""
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                for
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                yy
               </span>
               <span class="crayon-st">
                in
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                y
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                y
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-k ">
                int
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-k ">
                float
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                yy
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                for
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                yy
               </span>
               <span class="crayon-st">
                in
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                y
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                y
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-e">
                LabeledPoint
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                Vectors
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                dense
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                y
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e2d897254649-12">
               <span class="crayon-c">
                # features.take(5)
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e2d897254649-13">
               <span class="crayon-v">
                data
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                take
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-cn">
                5
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
             </div>
            </td>
           </tr>
          </tbody>
         </table>
        </div>
       </div>
       <!-- [Format Time: 0.0382 seconds] -->
       <p>
       </p>
       <p>
        这里有一个小细节：数据里存的是 "123"（带引号的字符串）而非数值 123，处理时需要注意。这里的代码写得比较粗糙，先这样看看，后面再做类似处理时会先把这些引号（"）去掉。Scala 版的代码没有出现这个问题，具体原因不清楚，不过这只是个小问题，注意一下就可以了。这样就生成了后面做分类所需的数据结构 LabeledPoint，很简单是不是？
        <br/>
        下面，我们一样处理一下 nbdata，作为后面 Naive Bayes 使用的数据。NB 不允许特征中出现负数，这个很好理解：概率不可能为负，对吧？但数据当中有些值是负的，这里我们先不看其具体含义，直接和书上一样，把负数当作 0.0 处理。实际应用中可能需要具体了解数据，或者先对原始数据做一个概率统计，才能使用相关的 Naive Bayes 算法。
       </p>
       <!-- Crayon Syntax Highlighter v_2.7.2_beta -->
       <div class="crayon-syntax crayon-theme-classic crayon-font-monaco crayon-os-pc print-yes notranslate" data-settings=" minimize scroll-mouseover" id="crayon-57685d3677e35967468360" style=" margin-top: 12px; margin-bottom: 12px; font-size: 12px !important; line-height: 15px !important;">
        <div class="crayon-toolbar" data-settings=" mouseover overlay hide delay" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
         <span class="crayon-title">
         </span>
         <div class="crayon-tools" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
          <div class="crayon-button crayon-nums-button" title="切换是否显示行编号">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-plain-button" title="纯文本显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-wrap-button" title="切换自动换行">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-expand-button" title="点击展开代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-copy-button" title="复制代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-popup-button" title="在新窗口中显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <span class="crayon-language">
           Python
          </span>
         </div>
        </div>
        <div class="crayon-info" style="min-height: 16.8px !important; line-height: 16.8px !important;">
        </div>
        <div class="crayon-plain-wrap">
         <textarea class="crayon-plain print-no" data-settings="dblclick" readonly="" style="-moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4; font-size: 12px !important; line-height: 15px !important;" wrap="soft">
          # naive bayes要求feature为非负features
nbdata = trimmed.map(lambda x:(x[-1],x[4:-1])).map(lambda (x,y): (int(x.replace("\"","")) ,[ 0.0 if yy =='\"?\"' else float(yy.replace("\"","")) for yy in y])).map(lambda (x,y): (x,[0.0 if yy&lt;0 else yy for yy in y])).map(lambda (x,y):LabeledPoint(x,Vectors.dense(y)))
# nbdata = trimmed.map(lambda x:(x[-1],x[4:-1])).map(lambda (x,y): (x.replace("\"","") ,[ 0.0 if yy =='\"?\"' else yy.replace("\"","") for yy in y])).map(lambda (x,y):(int(x),[float(yy) for yy in y])).map(lambda (x,y):[0.0 if yy&lt;0  else float(yy) for yy in y]).map(lambda (x,y):LabeledPoint(x,Vectors.dense(y)))
print nbdata.take(5)
# nbdata.cache
         </textarea>
        </div>
        <div class="crayon-main" style="">
         <table class="crayon-table">
          <tbody>
           <tr class="crayon-row">
            <td class="crayon-nums " data-settings="show">
             <div class="crayon-nums-content" style="font-size: 12px !important; line-height: 15px !important;">
              <div class="crayon-num" data-line="crayon-57685d3677e35967468360-1">
               1
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e35967468360-2">
               2
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e35967468360-3">
               3
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e35967468360-4">
               4
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e35967468360-5">
               5
              </div>
             </div>
            </td>
            <td class="crayon-code">
             <div class="crayon-pre" style="font-size: 12px !important; line-height: 15px !important; -moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4;">
              <div class="crayon-line" id="crayon-57685d3677e35967468360-1">
               <span class="crayon-c">
                # naive bayes要求feature为非负features
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e35967468360-2">
               <span class="crayon-v">
                nbdata
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                trimmed
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-o">
                -
               </span>
               <span class="crayon-cn">
                1
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-cn">
                4
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-o">
                -
               </span>
               <span class="crayon-cn">
                1
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                y
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-k ">
                int
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                replace
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-s">
                "\""
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-s">
                ""
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                0.0
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                if
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                yy
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                ==
               </span>
               <span class="crayon-s">
                '\"?\"'
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                else
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-k ">
                float
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                yy
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                replace
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-s">
                "\""
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-s">
                ""
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                for
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                yy
               </span>
               <span class="crayon-st">
                in
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                y
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                y
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-cn">
                0.0
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                if
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                yy
               </span>
               <span class="crayon-o">
                &lt;
               </span>
               <span class="crayon-cn">
                0
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                else
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                yy
               </span>
               <span class="crayon-st">
                for
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                yy
               </span>
               <span class="crayon-st">
                in
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                y
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                y
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-e">
                LabeledPoint
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                Vectors
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                dense
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                y
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e35967468360-3">
               <span class="crayon-c">
                # nbdata = trimmed.map(lambda x:(x[-1],x[4:-1])).map(lambda (x,y): (x.replace("\"","") ,[ 0.0 if yy =='\"?\"' else yy.replace("\"","") for yy in y])).map(lambda (x,y):(int(x),[float(yy) for yy in y])).map(lambda (x,y):[0.0 if yy&lt;0  else float(yy) for yy in y]).map(lambda (x,y):LabeledPoint(x,Vectors.dense(y)))
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e35967468360-4">
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                nbdata
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                take
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-cn">
                5
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e35967468360-5">
               <span class="crayon-c">
                # nbdata.cache
               </span>
              </div>
             </div>
            </td>
           </tr>
          </tbody>
         </table>
        </div>
       </div>
       <!-- [Format Time: 0.0428 seconds] -->
       <p>
       </p>
       <h3 id="wiz_toc_9">
        模型训练
       </h3>
       <p>
        这部分，我们直接调用 Spark MLlib 里面的分类器接口，分别训练出对应的 LR（逻辑回归）、SVM（支持向量机）、NB（朴素贝叶斯）和 DT（决策树）模型。
       </p>
       <!-- Crayon Syntax Highlighter v_2.7.2_beta -->
       <div class="crayon-syntax crayon-theme-classic crayon-font-monaco crayon-os-pc print-yes notranslate" data-settings=" minimize scroll-mouseover" id="crayon-57685d3677e3d076661481" style=" margin-top: 12px; margin-bottom: 12px; font-size: 12px !important; line-height: 15px !important;">
        <div class="crayon-toolbar" data-settings=" mouseover overlay hide delay" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
         <span class="crayon-title">
         </span>
         <div class="crayon-tools" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
          <div class="crayon-button crayon-nums-button" title="切换是否显示行编号">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-plain-button" title="纯文本显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-wrap-button" title="切换自动换行">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-expand-button" title="点击展开代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-copy-button" title="复制代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-popup-button" title="在新窗口中显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <span class="crayon-language">
           Python
          </span>
         </div>
        </div>
        <div class="crayon-info" style="min-height: 16.8px !important; line-height: 16.8px !important;">
        </div>
        <div class="crayon-plain-wrap">
         <textarea class="crayon-plain print-no" data-settings="dblclick" readonly="" style="-moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4; font-size: 12px !important; line-height: 15px !important;" wrap="soft">
#Training a classifier using logistic regression, SVM, naïve Bayes, and a decision tree
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.classification import SVMWithSGD
from pyspark.mllib.classification import NaiveBayes
from pyspark.mllib.tree import DecisionTree
# import pyspark.mllib.tree.
numIteration = 10
maxTreeDepth = 5
numClass = label.distinct().count()
print numClass
lrModel = LogisticRegressionWithSGD.train(data, numIteration)
svmModel = SVMWithSGD.train(data, numIteration)
nbModel = NaiveBayes.train(nbdata)
# dtModel = DecisionTree.trainClassifier(data,2,impurity='entropy')
dtModel = DecisionTree.trainClassifier(data,numClass,{},impurity='entropy', maxDepth=maxTreeDepth)
print lrModel
print dtModel
         </textarea>
        </div>
        <div class="crayon-main" style="">
         <table class="crayon-table">
          <tbody>
           <tr class="crayon-row">
            <td class="crayon-nums " data-settings="show">
             <div class="crayon-nums-content" style="font-size: 12px !important; line-height: 15px !important;">
              <div class="crayon-num" data-line="crayon-57685d3677e3d076661481-1">
               1
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e3d076661481-2">
               2
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e3d076661481-3">
               3
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e3d076661481-4">
               4
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e3d076661481-5">
               5
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e3d076661481-6">
               6
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e3d076661481-7">
               7
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e3d076661481-8">
               8
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e3d076661481-9">
               9
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e3d076661481-10">
               10
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e3d076661481-11">
               11
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e3d076661481-12">
               12
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e3d076661481-13">
               13
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e3d076661481-14">
               14
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e3d076661481-15">
               15
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e3d076661481-16">
               16
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e3d076661481-17">
               17
              </div>
             </div>
            </td>
            <td class="crayon-code">
             <div class="crayon-pre" style="font-size: 12px !important; line-height: 15px !important; -moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4;">
              <div class="crayon-line" id="crayon-57685d3677e3d076661481-1">
               <span class="crayon-c">
                #Training a classifier using logistic regression, SVM, naïve Bayes, and a decision tree
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e3d076661481-2">
               <span class="crayon-st">
                from
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                pyspark
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                mllib
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                classification
               </span>
               <span class="crayon-r">
                import
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                LogisticRegressionWithSGD
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e3d076661481-3">
               <span class="crayon-st">
                from
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                pyspark
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                mllib
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                classification
               </span>
               <span class="crayon-r">
                import
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                SVMWithSGD
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e3d076661481-4">
               <span class="crayon-st">
                from
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                pyspark
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                mllib
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                classification
               </span>
               <span class="crayon-r">
                import
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                NaiveBayes
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e3d076661481-5">
               <span class="crayon-st">
                from
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                pyspark
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                mllib
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                tree
               </span>
               <span class="crayon-r">
                import
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                DecisionTree
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e3d076661481-6">
               <span class="crayon-c">
                # import pyspark.mllib.tree.
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e3d076661481-7">
               <span class="crayon-v">
                numIteration
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                10
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e3d076661481-8">
               <span class="crayon-v">
                maxTreeDepth
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                5
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e3d076661481-9">
               <span class="crayon-v">
                numClass
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                label
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                distinct
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                count
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e3d076661481-10">
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                numClass
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e3d076661481-11">
               <span class="crayon-v">
                lrModel
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                LogisticRegressionWithSGD
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                train
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                data
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                numIteration
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e3d076661481-12">
               <span class="crayon-v">
                svmModel
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                SVMWithSGD
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                train
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                data
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                numIteration
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e3d076661481-13">
               <span class="crayon-v">
                nbModel
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                NaiveBayes
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                train
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                nbdata
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e3d076661481-14">
               <span class="crayon-c">
                # dtModel = DecisionTree.trainClassifier(data,2,impurity='entropy')
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e3d076661481-15">
               <span class="crayon-v">
                dtModel
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                DecisionTree
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                trainClassifier
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                data
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                numClass
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-sy">
                {
               </span>
               <span class="crayon-sy">
                }
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                impurity
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-s">
                'entropy'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                maxDepth
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-v">
                maxTreeDepth
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e3d076661481-16">
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                lrModel
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e3d076661481-17">
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                dtModel
               </span>
              </div>
             </div>
            </td>
           </tr>
          </tbody>
         </table>
        </div>
       </div>
       <!-- [Format Time: 0.0220 seconds] -->
       <p>
       </p>
       <h3 id="wiz_toc_10">
        使用模型对数据进行预测
       </h3>
       <p>
        直接调用模型的 predict 方法即可对数据进行预测，很简单，直接看代码：
       </p>
       <!-- Crayon Syntax Highlighter v_2.7.2_beta -->
       <div class="crayon-syntax crayon-theme-classic crayon-font-monaco crayon-os-pc print-yes notranslate" data-settings=" minimize scroll-mouseover" id="crayon-57685d3677e43392377067" style=" margin-top: 12px; margin-bottom: 12px; font-size: 12px !important; line-height: 15px !important;">
        <div class="crayon-toolbar" data-settings=" mouseover overlay hide delay" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
         <span class="crayon-title">
         </span>
         <div class="crayon-tools" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
          <div class="crayon-button crayon-nums-button" title="切换是否显示行编号">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-plain-button" title="纯文本显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-wrap-button" title="切换自动换行">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-expand-button" title="点击展开代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-copy-button" title="复制代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-popup-button" title="在新窗口中显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <span class="crayon-language">
           Python
          </span>
         </div>
        </div>
        <div class="crayon-info" style="min-height: 16.8px !important; line-height: 16.8px !important;">
        </div>
        <div class="crayon-plain-wrap">
         <textarea class="crayon-plain print-no" data-settings="dblclick" readonly="" style="-moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4; font-size: 12px !important; line-height: 15px !important;" wrap="soft">
# using these models
dataPoint = data.first()
prediction = lrModel.predict(dataPoint.features)
trueLabel = dataPoint.label
print 'The true label is %s, and the predict label is %s'%(trueLabel, prediction)
         </textarea>
        </div>
        <div class="crayon-main" style="">
         <table class="crayon-table">
          <tbody>
           <tr class="crayon-row">
            <td class="crayon-nums " data-settings="show">
             <div class="crayon-nums-content" style="font-size: 12px !important; line-height: 15px !important;">
              <div class="crayon-num" data-line="crayon-57685d3677e43392377067-1">
               1
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e43392377067-2">
               2
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e43392377067-3">
               3
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e43392377067-4">
               4
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e43392377067-5">
               5
              </div>
             </div>
            </td>
            <td class="crayon-code">
             <div class="crayon-pre" style="font-size: 12px !important; line-height: 15px !important; -moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4;">
              <div class="crayon-line" id="crayon-57685d3677e43392377067-1">
               <span class="crayon-c">
                # using these models
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e43392377067-2">
               <span class="crayon-v">
                dataPoint
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                data
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                first
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e43392377067-3">
               <span class="crayon-v">
                prediction
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                lrModel
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                predict
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                dataPoint
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                features
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e43392377067-4">
               <span class="crayon-v">
                trueLabel
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                dataPoint
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                label
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e43392377067-5">
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                'The true label is %s, and the predict label is %s'
               </span>
               <span class="crayon-o">
                %
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                trueLabel
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                prediction
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
             </div>
            </td>
           </tr>
          </tbody>
         </table>
        </div>
       </div>
       <!-- [Format Time: 0.0023 seconds] -->
       <p>
       </p>
       <h3 id="wiz_toc_11">
        模型评估
       </h3>
       <h4 id="wiz_toc_12">
        Accuracy and Prediction Error
       </h4>
       <p>
       </p>
       <!-- Crayon Syntax Highlighter v_2.7.2_beta -->
       <div class="crayon-syntax crayon-theme-classic crayon-font-monaco crayon-os-pc print-yes notranslate" data-settings=" minimize scroll-mouseover" id="crayon-57685d3677e49044711081" style=" margin-top: 12px; margin-bottom: 12px; font-size: 12px !important; line-height: 15px !important;">
        <div class="crayon-toolbar" data-settings=" mouseover overlay hide delay" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
         <span class="crayon-title">
         </span>
         <div class="crayon-tools" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
          <div class="crayon-button crayon-nums-button" title="切换是否显示行编号">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-plain-button" title="纯文本显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-wrap-button" title="切换自动换行">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-expand-button" title="点击展开代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-copy-button" title="复制代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-popup-button" title="在新窗口中显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <span class="crayon-language">
           Python
          </span>
         </div>
        </div>
        <div class="crayon-info" style="min-height: 16.8px !important; line-height: 16.8px !important;">
        </div>
        <div class="crayon-plain-wrap">
         <textarea class="crayon-plain print-no" data-settings="dblclick" readonly="" style="-moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4; font-size: 12px !important; line-height: 15px !important;" wrap="soft">
# Evaluating the classifier
lrTotalCorrect = data.map(lambda lp : 1 if(lrModel.predict(lp.features)==lp.label) else 0).sum()
svmTotalCorrect = data.map(lambda lp : 1 if(svmModel.predict(lp.features)==lp.label) else 0).sum()
nbTotalCorrect = nbdata.map(lambda lp: 1 if (nbModel.predict(lp.features) == lp.label) else 0).sum()
# dtTotalCorrect = data.map(lambda lp: 1 if (dtModel.predict(lp.features) == lp.label) else 0).sum()
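# 上面的写法会出错：PySpark 中 DecisionTreeModel.predict 需要通过 SparkContext 调用 JVM，不能在 RDD 的 transformation/action（例如 map 的 lambda）内部调用，只能像后面那样直接对整个特征 RDD 调用 predict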
# dtTotalCorrect = data.map(lambda lp: 1 if (dtModel.predict(lp.features) == lp.label) else 0).sum()
# predictionAndLabel = data.map(lambda lp: (dtModel.predict(lp.features),lp.label))
# print predictionAndLabel.take(5)
# dtTotalCorrect = predictionAndLabel.map(lambda (x,y): 1.0 if x==y else 0.0).sum()
# labels = data.map(lambda lp:lp.label).zip(prediction)
predictList= dtModel.predict(data.map(lambda lp: lp.features)).collect()
trueLabel = data.map(lambda lp: lp.label).collect()
# # diff = abs(predictList-trueLabel)
dtTotalCorrect = sum([1.0 if predictVal == trueLabel[i] else 0.0 for i, predictVal in enumerate(predictList)])
# dtTotalCorrect = sum(diff)
# print dtTotalCorrect
lrAccuracy = lrTotalCorrect/(data.count()*1.0)
svmAccuracy = svmTotalCorrect/(data.count()*1.0)
nbAccuracy = nbTotalCorrect/(1.0*nbdata.count())
dtAccuracy = dtTotalCorrect/(1.0*data.count())
print '------------data count: %s------------'%data.count()
print '------------lr Model Accuracy: %s------------'%lrAccuracy
print '------------svm Model Accuracy: %f------------'%svmAccuracy
print '------------nb Model Accuracy: %f------------'%nbAccuracy
print '------------dt Model Accuracy: %f------------'%dtAccuracy
print '-----------------------done-----------------------'
         </textarea>
        </div>
        <div class="crayon-main" style="">
         <table class="crayon-table">
          <tbody>
           <tr class="crayon-row">
            <td class="crayon-nums " data-settings="show">
             <div class="crayon-nums-content" style="font-size: 12px !important; line-height: 15px !important;">
              <div class="crayon-num" data-line="crayon-57685d3677e49044711081-1">
               1
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e49044711081-2">
               2
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e49044711081-3">
               3
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e49044711081-4">
               4
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e49044711081-5">
               5
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e49044711081-6">
               6
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e49044711081-7">
               7
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e49044711081-8">
               8
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e49044711081-9">
               9
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e49044711081-10">
               10
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e49044711081-11">
               11
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e49044711081-12">
               12
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e49044711081-13">
               13
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e49044711081-14">
               14
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e49044711081-15">
               15
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e49044711081-16">
               16
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e49044711081-17">
               17
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e49044711081-18">
               18
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e49044711081-19">
               19
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e49044711081-20">
               20
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e49044711081-21">
               21
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e49044711081-22">
               22
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e49044711081-23">
               23
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e49044711081-24">
               24
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e49044711081-25">
               25
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e49044711081-26">
               26
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e49044711081-27">
               27
              </div>
             </div>
            </td>
            <td class="crayon-code">
             <div class="crayon-pre" style="font-size: 12px !important; line-height: 15px !important; -moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4;">
              <div class="crayon-line" id="crayon-57685d3677e49044711081-1">
               <span class="crayon-c">
                # Evaluating the classifier
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e49044711081-2">
               <span class="crayon-v">
                lrTotalCorrect
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                data
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                lp
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                1
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                if
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                lrModel
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                predict
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                lp
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                features
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-o">
                ==
               </span>
               <span class="crayon-v">
                lp
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                label
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                else
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                0
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                sum
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e49044711081-3">
               <span class="crayon-v">
                svmTotalCorrect
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                data
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                lp
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                1
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                if
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                svmModel
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                predict
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                lp
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                features
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-o">
                ==
               </span>
               <span class="crayon-v">
                lp
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                label
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                else
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                0
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                sum
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e49044711081-4">
               <span class="crayon-v">
                nbTotalCorrect
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                nbdata
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                lp
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                1
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                if
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                nbModel
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                predict
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                lp
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                features
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                ==
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                lp
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                label
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                else
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                0
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                sum
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e49044711081-5">
               <span class="crayon-c">
                # dtTotalCorrect = data.map(lambda lp: 1 if (dtModel.predict(lp.features) == lp.label) else 0).sum()
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e49044711081-6">
               <span class="crayon-c">
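                # 上面的写法会出错：PySpark 中 DecisionTreeModel.predict 需要通过 SparkContext 调用 JVM，不能在 RDD 的 transformation/action（例如 map 的 lambda）内部调用，只能像后面那样直接对整个特征 RDD 调用 predict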
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e49044711081-7">
               <span class="crayon-c">
                # dtTotalCorrect = data.map(lambda lp: 1 if (dtModel.predict(lp.features) == lp.label) else 0).sum()
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e49044711081-8">
               <span class="crayon-c">
                # predictionAndLabel = data.map(lambda lp: (dtModel.predict(lp.features),lp.label))
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e49044711081-9">
               <span class="crayon-c">
                # print predictionAndLabel.take(5)
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e49044711081-10">
               <span class="crayon-c">
                # dtTotalCorrect = predictionAndLabel.map(lambda (x,y): 1.0 if x==y else 0.0).sum()
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e49044711081-11">
               <span class="crayon-c">
                # labels = data.map(lambda lp:lp.label).zip(prediction)
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e49044711081-12">
               <span class="crayon-v">
                predictList
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                dtModel
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                predict
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                data
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                lp
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                lp
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                features
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                collect
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e49044711081-13">
               <span class="crayon-v">
                trueLabel
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                data
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                lp
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                lp
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                label
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                collect
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e49044711081-14">
               <span class="crayon-c">
                # # diff = abs(predictList-trueLabel)
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e49044711081-15">
               <span class="crayon-v">
                dtTotalCorrect
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-k ">
                sum
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-cn">
                1.0
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                if
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                predictVal
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                ==
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                trueLabel
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-v">
                i
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                else
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                0.0
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                for
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                i
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                predictVal
               </span>
               <span class="crayon-st">
                in
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-k ">
                enumerate
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                predictList
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e49044711081-16">
               <span class="crayon-c">
                # dtTotalCorrect = sum(diff)
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e49044711081-17">
               <span class="crayon-c">
                # print dtTotalCorrect
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e49044711081-18">
               <span class="crayon-v">
                lrAccuracy
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                lrTotalCorrect
               </span>
               <span class="crayon-o">
                /
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                data
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                count
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-o">
                *
               </span>
               <span class="crayon-cn">
                1.0
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e49044711081-19">
               <span class="crayon-v">
                svmAccuracy
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                svmTotalCorrect
               </span>
               <span class="crayon-o">
                /
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                data
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                count
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-o">
                *
               </span>
               <span class="crayon-cn">
                1.0
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e49044711081-20">
               <span class="crayon-v">
                nbAccuracy
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                nbTotalCorrect
               </span>
               <span class="crayon-o">
                /
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-cn">
                1.0
               </span>
               <span class="crayon-o">
                *
               </span>
               <span class="crayon-v">
                nbdata
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                count
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e49044711081-21">
               <span class="crayon-v">
                dtAccuracy
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                dtTotalCorrect
               </span>
               <span class="crayon-o">
                /
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-cn">
                1.0
               </span>
               <span class="crayon-o">
                *
               </span>
               <span class="crayon-v">
                data
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                count
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e49044711081-22">
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                '------------data count: %s------------'
               </span>
               <span class="crayon-o">
                %
               </span>
               <span class="crayon-v">
                data
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                count
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e49044711081-23">
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                '------------lr Model Accuracy: %s------------'
               </span>
               <span class="crayon-o">
                %
               </span>
               <span class="crayon-e">
                lrAccuracy
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e49044711081-24">
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                '------------svm Model Accuracy: %f------------'
               </span>
               <span class="crayon-o">
                %
               </span>
               <span class="crayon-e">
                svmAccuracy
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e49044711081-25">
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                '------------nb Model Accuracy: %f------------'
               </span>
               <span class="crayon-o">
                %
               </span>
               <span class="crayon-e">
                nbAccuracy
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e49044711081-26">
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                '------------dt Model Accuracy: %f------------'
               </span>
               <span class="crayon-o">
                %
               </span>
               <span class="crayon-e">
                dtAccuracy
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e49044711081-27">
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                '-----------------------done-----------------------'
               </span>
              </div>
             </div>
            </td>
           </tr>
          </tbody>
         </table>
        </div>
       </div>
       <!-- [Format Time: 0.0689 seconds] -->
       <p>
       </p>
       <p>
        模型Accuracy：
        <br/>
        <img src="http://7xr3b9.com1.z0.glb.clouddn.com/mac_blogs_classifier_evalution.png" alt="各模型Accuracy评估结果"/>
       </p>
       <h5 id="wiz_toc_13">
        Precision and Recall
       </h5>
       <p>
        有了前面的Accuracy，为什么又要多一个Precision and Recall呢？其实，评估标准在机器学习里面算是特别重要的一块，具体可以看看
        <a href="http://hacker.duanshishi.com/?p=1423">
         机器学习模型评估
        </a>
        ，需要指出的是，Precision and Recall在这篇文章中讲的是Ranking Metrics，原理差不多，都是对精确率（Precision）和召回率（Recall）的综合考虑。抛开召回率单独谈精确率，是一个非常不专业的行为。下图是Spark中各种metrics的基本解释：
        <br/>
        <img src="http://7xr3b9.com1.z0.glb.clouddn.com/mac_blogs_classifier_metrics.png" alt="Spark中各种分类评估metrics的基本解释"/>
        <br/>
        一个分类器的Precision-Recall curve：
        <br/>
        <img src="http://7xr3b9.com1.z0.glb.clouddn.com/mac_blogs_classifier_PR_curve.png" alt="分类器的Precision-Recall曲线"/>
       </p>
       <h5 id="wiz_toc_14">
        ROC curve and AUC
       </h5>
       <p>
        ROC和PR曲线类似，用来表明特定False Positive Rate下的True Positive Rate。这里我就直接用英文表示了，翻译成“真阳性”“假阳性”感觉好二。举个例子来说明：对于一个垃圾邮件分类器，TPR表示的是所有被正确分类为垃圾邮件的数量与所有垃圾邮件数量的比值，FPR表示所有被判断为垃圾邮件的正常邮件与所有正常邮件的比值。以FPR为x轴、TPR为y轴，就可以画出对应的ROC Curve。
       </p>
       <!-- Crayon Syntax Highlighter v_2.7.2_beta -->
       <div class="crayon-syntax crayon-theme-classic crayon-font-monaco crayon-os-pc print-yes notranslate" data-settings=" minimize scroll-mouseover" id="crayon-57685d3677e52191605956" style=" margin-top: 12px; margin-bottom: 12px; font-size: 12px !important; line-height: 15px !important;">
        <div class="crayon-toolbar" data-settings=" mouseover overlay hide delay" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
         <span class="crayon-title">
         </span>
         <div class="crayon-tools" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
          <div class="crayon-button crayon-nums-button" title="切换是否显示行编号">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-plain-button" title="纯文本显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-wrap-button" title="切换自动换行">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-expand-button" title="点击展开代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-copy-button" title="复制代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-popup-button" title="在新窗口中显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <span class="crayon-language">
           Python
          </span>
         </div>
        </div>
        <div class="crayon-info" style="min-height: 16.8px !important; line-height: 16.8px !important;">
        </div>
        <div class="crayon-plain-wrap">
         <textarea class="crayon-plain print-no" data-settings="dblclick" readonly="" style="-moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4; font-size: 12px !important; line-height: 15px !important;" wrap="soft">
          # 计算AUC、和AUPR
# import pyspark.mllib.evaluation.BinaryClassificationMetrics
from pyspark.mllib.evaluation import BinaryClassificationMetrics
all_models_metrics = []
for model in [lrModel,svmModel]:
    scoresAndLabels = data.map(lambda point:(model.predict(point.features),point.label)).collect()
    scoresAndLabels = [(float(i),j) for (i,j) in scoresAndLabels]
    scoresAndLabels_sc = sc.parallelize(scoresAndLabels)
    metrics = BinaryClassificationMetrics(scoresAndLabels_sc)
    all_models_metrics.append((model.__class__.__name__,metrics.areaUnderROC, metrics.areaUnderPR))
print all_models_metrics
for model in [nbModel]:
    # float(model.predict(point.features)) is important or get a error 
    #'DoubleType can not accept object in type &lt;type 'numpy.float64'&gt;'
    scoresAndLabels = nbdata.map(lambda point:(float(model.predict(point.features)),point.label)).collect()
    #scoresAndLabeles = [(1.0*i,j) for (i,j) in scoresAndLabeles]
    #print scoresAndLabeles
    scoresAndLabels_sc = sc.parallelize(scoresAndLabels)
    #print scoresAndLabeles
    scoresAndLabeles_sc = scoresAndLabels_sc
    nb_metrics = BinaryClassificationMetrics(scoresAndLabels_sc)
    all_models_metrics.append((model.__class__.__name__, nb_metrics.areaUnderROC, nb_metrics.areaUnderPR))
print all_models_metrics
for model in [dtModel]:
#     scoresAndLabeles = data.map(lambda point:(model.predict(point.features),point.label)).collect()
    predictList= dtModel.predict(data.map(lambda lp: lp.features)).collect()
    trueLabel = data.map(lambda lp: lp.label).collect()
#     scoresAndLabeles = [(1.0*i,j) for (i,j) in scoresAndLabeles]
#     print scoresAndLabeles
    scoresAndLabels = [(predictList[i],true_val) for i, true_val in enumerate(trueLabel)]
    scoresAndLabels_sc = sc.parallelize(scoresAndLabels)
#     print scoresAndLabeles
    scoresAndLabels_sc = scoresAndLabels_sc.map(lambda (x,y): (float(x),float(y)))
    dt_metrics = BinaryClassificationMetrics(scoresAndLabels_sc)
    all_models_metrics.append((model.__class__.__name__, dt_metrics.areaUnderROC, dt_metrics.areaUnderPR))
print all_models_metrics
         </textarea>
        </div>
        <div class="crayon-main" style="">
         <table class="crayon-table">
          <tbody>
           <tr class="crayon-row">
            <td class="crayon-nums " data-settings="show">
             <div class="crayon-nums-content" style="font-size: 12px !important; line-height: 15px !important;">
              <div class="crayon-num" data-line="crayon-57685d3677e52191605956-1">
               1
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e52191605956-2">
               2
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e52191605956-3">
               3
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e52191605956-4">
               4
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e52191605956-5">
               5
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e52191605956-6">
               6
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e52191605956-7">
               7
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e52191605956-8">
               8
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e52191605956-9">
               9
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e52191605956-10">
               10
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e52191605956-11">
               11
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e52191605956-12">
               12
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e52191605956-13">
               13
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e52191605956-14">
               14
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e52191605956-15">
               15
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e52191605956-16">
               16
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e52191605956-17">
               17
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e52191605956-18">
               18
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e52191605956-19">
               19
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e52191605956-20">
               20
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e52191605956-21">
               21
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e52191605956-22">
               22
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e52191605956-23">
               23
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e52191605956-24">
               24
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e52191605956-25">
               25
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e52191605956-26">
               26
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e52191605956-27">
               27
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e52191605956-28">
               28
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e52191605956-29">
               29
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e52191605956-30">
               30
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e52191605956-31">
               31
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e52191605956-32">
               32
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e52191605956-33">
               33
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e52191605956-34">
               34
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e52191605956-35">
               35
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e52191605956-36">
               36
              </div>
             </div>
            </td>
            <td class="crayon-code">
             <div class="crayon-pre" style="font-size: 12px !important; line-height: 15px !important; -moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4;">
              <div class="crayon-line" id="crayon-57685d3677e52191605956-1">
               <span class="crayon-c">
                # 计算AUC、和AUPR
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e52191605956-2">
               <span class="crayon-c">
                # import pyspark.mllib.evaluation.BinaryClassificationMetrics
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e52191605956-3">
               <span class="crayon-st">
                from
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                pyspark
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                mllib
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                evaluation
               </span>
               <span class="crayon-r">
                import
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                BinaryClassificationMetrics
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e52191605956-4">
               <span class="crayon-v">
                all_models_metrics
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-sy">
                ]
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e52191605956-5">
               <span class="crayon-st">
                for
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                model
               </span>
               <span class="crayon-st">
                in
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-v">
                lrModel
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                svmModel
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-o">
                :
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e52191605956-6">
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                scoresAndLabels
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                data
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                point
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                model
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                predict
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                point
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                features
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                point
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                label
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                collect
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e52191605956-7">
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                scoresAndLabels
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-k ">
                float
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                i
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                j
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                for
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                i
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                j
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                in
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                scoresAndLabels
               </span>
               <span class="crayon-sy">
                ]
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e52191605956-8">
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                scoresAndLabels_sc
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                sc
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                parallelize
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                scoresAndLabels
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e52191605956-9">
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                metrics
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                BinaryClassificationMetrics
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                scoresAndLabels_sc
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e52191605956-10">
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                all_models_metrics
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                append
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                model
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                __class__
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                __name__
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                metrics
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                areaUnderROC
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                metrics
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                areaUnderPR
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e52191605956-11">
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                all_models_metrics
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e52191605956-12">
               <span class="crayon-st">
                for
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                model
               </span>
               <span class="crayon-st">
                in
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-v">
                nbModel
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-o">
                :
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e52191605956-13">
               <span class="crayon-h">
               </span>
               <span class="crayon-c">
                # float(model.predict(point.features)) is important or get a error
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e52191605956-14">
               <span class="crayon-h">
               </span>
               <span class="crayon-c">
                #'DoubleType can not accept object in type &lt;type 'numpy.float64'&gt;'
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e52191605956-15">
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                scoresAndLabels
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                nbdata
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                point
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-k ">
                float
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                model
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                predict
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                point
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                features
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                point
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                label
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                collect
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e52191605956-16">
               <span class="crayon-h">
               </span>
               <span class="crayon-c">
                #scoresAndLabeles = [(1.0*i,j) for (i,j) in scoresAndLabeles]
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e52191605956-17">
               <span class="crayon-h">
               </span>
               <span class="crayon-c">
                #print scoresAndLabeles
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e52191605956-18">
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                scoresAndLabels_sc
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                sc
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                parallelize
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                scoresAndLabels
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e52191605956-19">
               <span class="crayon-h">
               </span>
               <span class="crayon-c">
                #print scoresAndLabeles
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e52191605956-20">
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                scoresAndLabeles_sc
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                scoresAndLabels_sc
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e52191605956-21">
               <span class="crayon-e">
               </span>
               <span class="crayon-v">
                nb_metrics
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                BinaryClassificationMetrics
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                scoresAndLabels_sc
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e52191605956-22">
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                all_models_metrics
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                append
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                model
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                __class__
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                __name__
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                nb_metrics
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                areaUnderROC
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                nb_metrics
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                areaUnderPR
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e52191605956-23">
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                all_models_metrics
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e52191605956-24">
               <span class="crayon-st">
                for
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                model
               </span>
               <span class="crayon-st">
                in
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-v">
                dtModel
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-o">
                :
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e52191605956-25">
               <span class="crayon-c">
                #     scoresAndLabeles = data.map(lambda point:(model.predict(point.features),point.label)).collect()
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e52191605956-26">
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                predictList
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                dtModel
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                predict
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                data
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                lp
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                lp
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                features
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                collect
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e52191605956-27">
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                trueLabel
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                data
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                lp
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                lp
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                label
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                collect
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e52191605956-28">
               <span class="crayon-c">
                #     scoresAndLabeles = [(1.0*i,j) for (i,j) in scoresAndLabeles]
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e52191605956-29">
               <span class="crayon-c">
                #     print scoresAndLabeles
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e52191605956-30">
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                scoresAndLabels
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                predictList
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-v">
                i
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                true_val
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                for
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                i
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                true_val
               </span>
               <span class="crayon-st">
                in
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-k ">
                enumerate
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                trueLabel
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                ]
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e52191605956-31">
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                scoresAndLabels_sc
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                sc
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                parallelize
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                scoresAndLabels
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e52191605956-32">
               <span class="crayon-c">
                #     print scoresAndLabeles
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e52191605956-33">
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                scoresAndLabels_sc
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                scoresAndLabels_sc
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                y
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-k ">
                float
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-k ">
                float
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                y
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e52191605956-34">
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                dt_metrics
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                BinaryClassificationMetrics
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                scoresAndLabels_sc
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e52191605956-35">
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                all_models_metrics
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                append
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                model
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                __class__
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                __name__
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                dt_metrics
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                areaUnderROC
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                dt_metrics
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                areaUnderPR
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e52191605956-36">
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                all_models_metrics
               </span>
              </div>
             </div>
            </td>
           </tr>
          </tbody>
         </table>
        </div>
       </div>
       <!-- [Format Time: 0.0696 seconds] -->
       <p>
       </p>
       <p>
        <img alt="各分类模型评估结果（模型名称、ROC 曲线下面积、PR 曲线下面积）" src="http://7xr3b9.com1.z0.glb.clouddn.com/mac_blogs_classifiers_results.png"/>
       </p>
       <h3 id="wiz_toc_15">
        模型调参、提高模型性能
       </h3>
       <h4 id="wiz_toc_16">
        特征标准化
       </h4>
       <p>
        在机器学习方法中，对特征进行标准化是一项特别重要的工作。何为 standardization？举个例子：小明数学考了82分、语文考了90分，那我们能说小明语文考得比数学好吗？显然不能，我们必须知道全班其他学生的考试情况，才能比较小明的语文和数学哪一科考得更好。那么说了这么多，到底为什么要做 standardization 呢？这里截取了 Andrew Ng 课程上的截图来说明：
        <br/>
        <img alt="特征标准化示意图（Andrew Ng 课程截图）" src="http://7xr3b9.com1.z0.glb.clouddn.com/mac_blogs_feature_standardization.png"/>
        <br/>
        <img alt="需要做特征标准化的原因：标准化前后寻优路径对比（Andrew Ng 课程截图）" src="http://7xr3b9.com1.z0.glb.clouddn.com/mac_blogs_reason_feature_standardzation.png"/>
        <br/>
        如果各特征不在同一个标准下，很容易出现左图中的情况：寻优路径很容易呈“之”字形；而右图相对于左图的“之”字形路径，能够更快速地寻优，达到更快的收敛，并在一定程度上提高模型的精确性。
       </p>
       <!-- Crayon Syntax Highlighter v_2.7.2_beta -->
       <div class="crayon-syntax crayon-theme-classic crayon-font-monaco crayon-os-pc print-yes notranslate" data-settings=" minimize scroll-mouseover" id="crayon-57685d3677e5c333121444" style=" margin-top: 12px; margin-bottom: 12px; font-size: 12px !important; line-height: 15px !important;">
        <div class="crayon-toolbar" data-settings=" mouseover overlay hide delay" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
         <span class="crayon-title">
         </span>
         <div class="crayon-tools" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
          <div class="crayon-button crayon-nums-button" title="切换是否显示行编号">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-plain-button" title="纯文本显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-wrap-button" title="切换自动换行">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-expand-button" title="点击展开代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-copy-button" title="复制代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-popup-button" title="在新窗口中显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <span class="crayon-language">
           Python
          </span>
         </div>
        </div>
        <div class="crayon-info" style="min-height: 16.8px !important; line-height: 16.8px !important;">
        </div>
        <div class="crayon-plain-wrap">
         <textarea class="crayon-plain print-no" data-settings="dblclick" readonly="" style="-moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4; font-size: 12px !important; line-height: 15px !important;" wrap="soft">
from pyspark.mllib.feature import StandardScalerModel,StandardScaler
scaler = StandardScaler(withMean=True, withStd=True).fit(vectors)
labels = data.map(lambda lp: lp.label)
features = data.map(lambda lp: lp.features)
print features.take(5)
scaled_data = labels.zip(scaler.transform(features))
scaled_data = scaled_data.map(lambda (x,y): LabeledPoint(x,y))
print scaled_data.first().features
print data.first().features
# 用标准化数据来训练lr模型
lrModelScaled = LogisticRegressionWithSGD.train(scaled_data, numIteration)
lrTotalCorrectScaled = scaled_data.map(lambda lp : 1 if(lrModelScaled.predict(lp.features)==lp.label) else 0).sum()
lrAccuracyScaled = lrTotalCorrectScaled/(1.0*data.count())
print 'lrAccuracyscaled : %f'%lrAccuracyScaled
all_models_metrics =[]
for model in [lrModelScaled]:
    scoresAndLabels = scaled_data.map(lambda point:(model.predict(point.features),point.label)).collect()
    scoresAndLabels = [(float(i),j) for (i,j) in scoresAndLabels]
    scoresAndLabels_sc = sc.parallelize(scoresAndLabels)
    metrics = BinaryClassificationMetrics(scoresAndLabels_sc)
    all_models_metrics.append((model.__class__.__name__,metrics.areaUnderROC, metrics.areaUnderPR))

print all_models_metrics
         </textarea>
        </div>
        <div class="crayon-main" style="">
         <table class="crayon-table">
          <tbody>
           <tr class="crayon-row">
            <td class="crayon-nums " data-settings="show">
             <div class="crayon-nums-content" style="font-size: 12px !important; line-height: 15px !important;">
              <div class="crayon-num" data-line="crayon-57685d3677e5c333121444-1">
               1
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e5c333121444-2">
               2
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e5c333121444-3">
               3
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e5c333121444-4">
               4
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e5c333121444-5">
               5
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e5c333121444-6">
               6
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e5c333121444-7">
               7
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e5c333121444-8">
               8
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e5c333121444-9">
               9
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e5c333121444-10">
               10
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e5c333121444-11">
               11
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e5c333121444-12">
               12
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e5c333121444-13">
               13
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e5c333121444-14">
               14
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e5c333121444-15">
               15
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e5c333121444-16">
               16
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e5c333121444-17">
               17
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e5c333121444-18">
               18
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e5c333121444-19">
               19
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e5c333121444-20">
               20
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e5c333121444-21">
               21
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e5c333121444-22">
               22
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e5c333121444-23">
               23
              </div>
             </div>
            </td>
            <td class="crayon-code">
             <div class="crayon-pre" style="font-size: 12px !important; line-height: 15px !important; -moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4;">
              <div class="crayon-line" id="crayon-57685d3677e5c333121444-1">
               <span class="crayon-st">
                from
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                pyspark
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                mllib
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                feature
               </span>
               <span class="crayon-r">
                import
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                StandardScalerModel
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-e">
                StandardScaler
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e5c333121444-2">
               <span class="crayon-v">
                scaler
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                StandardScaler
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                withMean
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-t">
                True
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                withStd
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-t">
                True
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                fit
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                vectors
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e5c333121444-3">
               <span class="crayon-v">
                labels
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                data
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                lp
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                lp
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                label
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e5c333121444-4">
               <span class="crayon-v">
                features
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                data
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                lp
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                lp
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                features
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e5c333121444-5">
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                features
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                take
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-cn">
                5
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e5c333121444-6">
               <span class="crayon-v">
                scaled_data
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                labels
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                zip
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                scaler
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                transform
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                features
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e5c333121444-7">
               <span class="crayon-v">
                scaled_data
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                scaled_data
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                y
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                LabeledPoint
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                y
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e5c333121444-8">
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                scaled_data
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                first
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                features
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e5c333121444-9">
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                data
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                first
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                features
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e5c333121444-10">
               <span class="crayon-c">
                # 用标准化数据来训练lr模型
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e5c333121444-11">
               <span class="crayon-v">
                lrModelScaled
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                LogisticRegressionWithSGD
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                train
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                scaled_data
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                numIteration
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e5c333121444-12">
               <span class="crayon-v">
                lrTotalCorrectScaled
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                scaled_data
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                lp
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                1
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                if
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                lrModelScaled
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                predict
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                lp
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                features
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-o">
                ==
               </span>
               <span class="crayon-v">
                lp
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                label
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                else
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                0
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                sum
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e5c333121444-13">
               <span class="crayon-v">
                lrAccuracyScaled
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                lrTotalCorrectScaled
               </span>
               <span class="crayon-o">
                /
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-cn">
                1.0
               </span>
               <span class="crayon-o">
                *
               </span>
               <span class="crayon-v">
                data
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                count
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e5c333121444-14">
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                'lrAccuracyscaled : %f'
               </span>
               <span class="crayon-o">
                %
               </span>
               <span class="crayon-e">
                lrAccuracyScaled
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e5c333121444-15">
               <span class="crayon-v">
                all_models_metrics
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-sy">
                ]
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e5c333121444-16">
               <span class="crayon-st">
                for
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                model
               </span>
               <span class="crayon-st">
                in
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-v">
                lrModelScaled
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-o">
                :
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e5c333121444-17">
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                scoresAndLabels
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                scaled_data
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                point
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                model
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                predict
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                point
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                features
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                point
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                label
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                collect
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e5c333121444-18">
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                scoresAndLabels
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-k ">
                float
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                i
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                j
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                for
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                i
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                j
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                in
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                scoresAndLabels
               </span>
               <span class="crayon-sy">
                ]
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e5c333121444-19">
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                scoresAndLabels_sc
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                sc
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                parallelize
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                scoresAndLabels
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e5c333121444-20">
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                metrics
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                BinaryClassificationMetrics
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                scoresAndLabels_sc
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e5c333121444-21">
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                all_models_metrics
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                append
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                model
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                __class__
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                __name__
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                metrics
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                areaUnderROC
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                metrics
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                areaUnderPR
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e5c333121444-22">
              </div>
              <div class="crayon-line" id="crayon-57685d3677e5c333121444-23">
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                all_models_metrics
               </span>
              </div>
             </div>
            </td>
           </tr>
          </tbody>
         </table>
        </div>
       </div>
       <!-- [Format Time: 0.0481 seconds] -->
       <p>
       </p>
       <p>
        Accuracy:0.620960
        <br/>
        <img alt="使用标准化数据训练的逻辑回归模型的评估结果（AUC 与 PR）截图" src="http://7xr3b9.com1.z0.glb.clouddn.com/mac_blogs_lr_standardization1.png"/>
        <br/>
        最终结果：与使用未标准化数据训练的模型相比，标准化之后的模型在 accuracy 和 AUC 上都有比较明显的提升；至于 PR 为什么没有提升，目前还不是特别清楚，书上也没有给出解释。
       </p>
       <h4 id="wiz_toc_17">
        增加数据特征
       </h4>
       <p>
        这里我们将原始数据中的第4列（类别变量，categorical variable）编码为 K 维的二值向量（即 1-of-K 编码，dummy variables）：
       </p>
       <!-- Crayon Syntax Highlighter v_2.7.2_beta -->
       <div class="crayon-syntax crayon-theme-classic crayon-font-monaco crayon-os-pc print-yes notranslate" data-settings=" minimize scroll-mouseover" id="crayon-57685d3677e63595657247" style=" margin-top: 12px; margin-bottom: 12px; font-size: 12px !important; line-height: 15px !important;">
        <div class="crayon-toolbar" data-settings=" mouseover overlay hide delay" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
         <span class="crayon-title">
         </span>
         <div class="crayon-tools" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
          <div class="crayon-button crayon-nums-button" title="切换是否显示行编号">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-plain-button" title="纯文本显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-wrap-button" title="切换自动换行">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-expand-button" title="点击展开代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-copy-button" title="复制代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-popup-button" title="在新窗口中显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <span class="crayon-language">
           Python
          </span>
         </div>
        </div>
        <div class="crayon-info" style="min-height: 16.8px !important; line-height: 16.8px !important;">
        </div>
        <!-- Plain-text copy of the snippet shown in the highlighted table that follows.
             The script uses Python 2 syntax (print statements, tuple-unpacking lambdas).
             A textarea keeps its text verbatim, so the first code line must start at
             column 0 like the rest of the script: an indented first statement followed
             by unindented ones makes the copied code fail with an IndentationError.
             Keep this text in sync, line for line, with the highlighted view. -->
        <div class="crayon-plain-wrap">
         <textarea class="crayon-plain print-no" data-settings="dblclick" readonly="" style="-moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4; font-size: 12px !important; line-height: 15px !important;" wrap="soft">
categories = records.map(lambda x: x[3]).distinct().zipWithIndex().collect()
category_dict = {}
categories
for  (x,y) in [(key.replace('\"','') ,val) for (key, val) in categories]:
    category_dict[x] = y
num_categories = len(category_dict)
otherdata = trimmed.map(lambda x:(x[-1],x[4:-1])).map(lambda (x,y): (x.replace("\"","") ,[ 0.0 if yy =='\"?\"' else yy.replace("\"","") for yy in y])).map(lambda (x,y):(int(x),[float(yy) for yy in y])).map(lambda (x,y):LabeledPoint(x,Vectors.dense(y)))
otherdata.take(5)

def func1(x):
# 这里把前面的合在一起做了，然后最终把category_feature和other_feature合在一起
    import numpy as np
    label = x[-1].replace('\"','')
    other_feature = [0.0 if yy == '?' else yy for yy in [ y.replace('\"','') for y in x[4:-1]]]
    category_Idx = category_dict[x[3].replace('\"','')]
    category_feature = np.zeros(num_categories)
    category_feature[category_Idx] = 1
    return LabeledPoint(label, Vectors.dense(list(category_feature)+other_feature))
category_data = trimmed.map(lambda x:func1(x))
category_data.take(5)
# category_data.take(5)
category_labels = category_data.map(lambda lp: lp.label)
category_features = category_data.map(lambda lp: lp.features)
scaler2 = StandardScaler(withMean=True, withStd=True).fit(category_features)
print category_features.take(5)
scaled_category_data = category_labels.zip(scaler2.transform(category_features))
scaled_category_data = scaled_category_data.map(lambda (x,y): LabeledPoint(x,y))
print scaled_category_data.take(5)

# 取出label和features，然后对features做Standardization
category_labels = category_data.map(lambda lp: lp.label)
category_features = category_data.map(lambda lp: lp.features)
scaler2 = StandardScaler(withMean=True, withStd=True).fit(category_features)
print category_features.take(5) 
scaled_category_data = category_labels.zip(scaler2.transform(category_features))
scaled_category_data = scaled_category_data.map(lambda (x,y): LabeledPoint(x,y))
print scaled_category_data.take(5)

# fit添加了category var的数据
lrModel_category_scaled = LogisticRegressionWithSGD.train(scaled_category_data, numIteration)
lr_totalCorrect_category_scaled = scaled_category_data.map(lambda lp : 1            if(lrModel_category_scaled.predict(lp.features)==lp.label) else 0).sum()
lr_accuracy_category_scaled = lr_totalCorrect_category_scaled/(1.0*data.count())
print 'lrModel_category_scaled : %f'%lr_accuracy_category_scaled

all_models_metrics =[]
for model in [lrModel_category_scaled]:
    scoresAndLabels = scaled_category_data.map(lambda point:(model.predict(point.features),point.label)).collect()
    scoresAndLabels = [(float(i),j) for (i,j) in scoresAndLabels]
    scoresAndLabels_sc = sc.parallelize(scoresAndLabels)
    metrics = BinaryClassificationMetrics(scoresAndLabels_sc)
    all_models_metrics.append((model.__class__.__name__,metrics.areaUnderROC, metrics.areaUnderPR))

print all_models_metrics
         </textarea>
        </div>
        <div class="crayon-main" style="">
         <table class="crayon-table">
          <tbody>
           <tr class="crayon-row">
            <td class="crayon-nums " data-settings="show">
             <div class="crayon-nums-content" style="font-size: 12px !important; line-height: 15px !important;">
              <div class="crayon-num" data-line="crayon-57685d3677e63595657247-1">
               1
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e63595657247-2">
               2
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e63595657247-3">
               3
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e63595657247-4">
               4
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e63595657247-5">
               5
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e63595657247-6">
               6
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e63595657247-7">
               7
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e63595657247-8">
               8
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e63595657247-9">
               9
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e63595657247-10">
               10
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e63595657247-11">
               11
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e63595657247-12">
               12
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e63595657247-13">
               13
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e63595657247-14">
               14
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e63595657247-15">
               15
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e63595657247-16">
               16
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e63595657247-17">
               17
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e63595657247-18">
               18
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e63595657247-19">
               19
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e63595657247-20">
               20
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e63595657247-21">
               21
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e63595657247-22">
               22
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e63595657247-23">
               23
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e63595657247-24">
               24
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e63595657247-25">
               25
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e63595657247-26">
               26
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e63595657247-27">
               27
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e63595657247-28">
               28
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e63595657247-29">
               29
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e63595657247-30">
               30
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e63595657247-31">
               31
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e63595657247-32">
               32
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e63595657247-33">
               33
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e63595657247-34">
               34
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e63595657247-35">
               35
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e63595657247-36">
               36
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e63595657247-37">
               37
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e63595657247-38">
               38
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e63595657247-39">
               39
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e63595657247-40">
               40
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e63595657247-41">
               41
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e63595657247-42">
               42
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e63595657247-43">
               43
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e63595657247-44">
               44
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e63595657247-45">
               45
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e63595657247-46">
               46
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e63595657247-47">
               47
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e63595657247-48">
               48
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e63595657247-49">
               49
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e63595657247-50">
               50
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e63595657247-51">
               51
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e63595657247-52">
               52
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e63595657247-53">
               53
              </div>
             </div>
            </td>
            <td class="crayon-code">
             <div class="crayon-pre" style="font-size: 12px !important; line-height: 15px !important; -moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4;">
              <div class="crayon-line" id="crayon-57685d3677e63595657247-1">
               <span class="crayon-v">
                categories
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                records
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-cn">
                3
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                distinct
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                zipWithIndex
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                collect
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e63595657247-2">
               <span class="crayon-v">
                category_dict
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                {
               </span>
               <span class="crayon-sy">
                }
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e63595657247-3">
               <span class="crayon-e">
                categories
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e63595657247-4">
               <span class="crayon-st">
                for
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                y
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                in
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                key
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                replace
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-s">
                '\"'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-s">
                ''
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                val
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                for
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                key
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                val
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                in
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                categories
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-o">
                :
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e63595657247-5">
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                category_dict
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                y
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e63595657247-6">
               <span class="crayon-v">
                num_categories
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-k ">
                len
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                category_dict
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e63595657247-7">
               <span class="crayon-v">
                otherdata
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                trimmed
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-o">
                -
               </span>
               <span class="crayon-cn">
                1
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-cn">
                4
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-o">
                -
               </span>
               <span class="crayon-cn">
                1
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                y
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                replace
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-s">
                "\""
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-s">
                ""
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                0.0
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                if
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                yy
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                ==
               </span>
               <span class="crayon-s">
                '\"?\"'
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                else
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                yy
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                replace
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-s">
                "\""
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-s">
                ""
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                for
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                yy
               </span>
               <span class="crayon-st">
                in
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                y
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                y
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-k ">
                int
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-k ">
                float
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                yy
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                for
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                yy
               </span>
               <span class="crayon-st">
                in
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                y
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                y
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-e">
                LabeledPoint
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                Vectors
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                dense
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                y
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e63595657247-8">
               <span class="crayon-v">
                otherdata
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                take
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-cn">
                5
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e63595657247-9">
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e63595657247-10">
               <span class="crayon-r">
                def
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                func1
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-o">
                :
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e63595657247-11">
               <span class="crayon-c">
                # 这里把前面的合在一起做了，然后最终把category_feature和other_feature合在一起
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e63595657247-12">
               <span class="crayon-h">
               </span>
               <span class="crayon-r">
                import
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                numpy
               </span>
               <span class="crayon-st">
                as
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                np
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e63595657247-13">
               <span class="crayon-e">
               </span>
               <span class="crayon-v">
                label
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-o">
                -
               </span>
               <span class="crayon-cn">
                1
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                replace
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-s">
                '\"'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-s">
                ''
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e63595657247-14">
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                other_feature
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-cn">
                0.0
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                if
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                yy
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                ==
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                '?'
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                else
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                yy
               </span>
               <span class="crayon-st">
                for
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                yy
               </span>
               <span class="crayon-st">
                in
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                y
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                replace
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-s">
                '\"'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-s">
                ''
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                for
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                y
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                in
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-cn">
                4
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-o">
                -
               </span>
               <span class="crayon-cn">
                1
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-sy">
                ]
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e63595657247-15">
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                category_Idx
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                category_dict
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-cn">
                3
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                replace
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-s">
                '\"'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-s">
                ''
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                ]
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e63595657247-16">
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                category_feature
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                np
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                zeros
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                num_categories
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e63595657247-17">
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                category_feature
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-v">
                category_Idx
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                1
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e63595657247-18">
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                return
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                LabeledPoint
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                label
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                Vectors
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                dense
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-k ">
                list
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                category_feature
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-o">
                +
               </span>
               <span class="crayon-v">
                other_feature
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e63595657247-19">
               <span class="crayon-v">
                category_data
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                trimmed
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-e">
                func1
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e63595657247-20">
               <span class="crayon-v">
                category_data
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                take
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-cn">
                5
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e63595657247-21">
               <span class="crayon-c">
                # category_data.take(5)
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e63595657247-22">
               <span class="crayon-v">
                category_labels
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                category_data
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                lp
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                lp
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                label
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e63595657247-23">
               <span class="crayon-v">
                category_features
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                category_data
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                lp
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                lp
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                features
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e63595657247-24">
               <span class="crayon-v">
                scaler2
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                StandardScaler
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                withMean
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-t">
                True
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                withStd
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-t">
                True
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                fit
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                category_features
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e63595657247-25">
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                category_features
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                take
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-cn">
                5
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e63595657247-26">
               <span class="crayon-v">
                scaled_category_data
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                category_labels
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                zip
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                scaler2
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                transform
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                category_features
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e63595657247-27">
               <span class="crayon-v">
                scaled_category_data
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                scaled_category_data
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                y
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                LabeledPoint
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                y
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e63595657247-28">
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                scaled_category_data
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                take
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-cn">
                5
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e63595657247-29">
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e63595657247-30">
               <span class="crayon-c">
                # 取出label和features，然后对features做Standardization
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e63595657247-31">
               <span class="crayon-v">
                category_labels
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                category_data
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                lp
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                lp
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                label
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e63595657247-32">
               <span class="crayon-v">
                category_features
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                category_data
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                lp
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                lp
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                features
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e63595657247-33">
               <span class="crayon-v">
                scaler2
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                StandardScaler
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                withMean
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-t">
                True
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                withStd
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-t">
                True
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                fit
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                category_features
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e63595657247-34">
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                category_features
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                take
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-cn">
                5
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-h">
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e63595657247-35">
               <span class="crayon-v">
                scaled_category_data
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                category_labels
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                zip
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                scaler2
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                transform
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                category_features
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e63595657247-36">
               <span class="crayon-v">
                scaled_category_data
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                scaled_category_data
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                y
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                LabeledPoint
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                y
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e63595657247-37">
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                scaled_category_data
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                take
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-cn">
                5
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e63595657247-38">
              </div>
              <div class="crayon-line" id="crayon-57685d3677e63595657247-39">
               <span class="crayon-c">
                # 用加入了category特征并做过标准化的数据训练逻辑回归模型
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e63595657247-40">
               <span class="crayon-v">
                lrModel_category_scaled
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                LogisticRegressionWithSGD
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                train
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                scaled_category_data
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                numIteration
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e63595657247-41">
               <span class="crayon-v">
                lr_totalCorrect_category_scaled
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                scaled_category_data
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                lp
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                1
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                if
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                lrModel_category_scaled
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                predict
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                lp
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                features
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-o">
                ==
               </span>
               <span class="crayon-v">
                lp
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                label
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                else
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                0
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                sum
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e63595657247-42">
               <span class="crayon-v">
                lr_accuracy_category_scaled
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                lr_totalCorrect_category_scaled
               </span>
               <span class="crayon-o">
                /
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-cn">
                1.0
               </span>
               <span class="crayon-o">
                *
               </span>
               <span class="crayon-v">
                data
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                count
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e63595657247-43">
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                'lrModel_category_scaled : %f'
               </span>
               <span class="crayon-o">
                %
               </span>
               <span class="crayon-e">
                lr_accuracy_category_scaled
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e63595657247-44">
              </div>
              <div class="crayon-line" id="crayon-57685d3677e63595657247-45">
               <span class="crayon-v">
                all_models_metrics
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-sy">
                ]
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e63595657247-46">
               <span class="crayon-st">
                for
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                model
               </span>
               <span class="crayon-st">
                in
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-v">
                lrModel_category_scaled
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-o">
                :
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e63595657247-47">
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                scoresAndLabels
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                scaled_category_data
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                point
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                model
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                predict
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                point
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                features
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                point
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                label
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                collect
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e63595657247-48">
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                scoresAndLabels
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-k ">
                float
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                i
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                j
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                for
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                i
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                j
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                in
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                scoresAndLabels
               </span>
               <span class="crayon-sy">
                ]
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e63595657247-49">
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                scoresAndLabels_sc
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                sc
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                parallelize
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                scoresAndLabels
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e63595657247-50">
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                metrics
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                BinaryClassificationMetrics
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                scoresAndLabels_sc
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e63595657247-51">
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                all_models_metrics
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                append
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                model
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                __class__
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                __name__
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                metrics
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                areaUnderROC
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                metrics
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                areaUnderPR
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e63595657247-52">
              </div>
              <div class="crayon-line" id="crayon-57685d3677e63595657247-53">
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                all_models_metrics
               </span>
              </div>
             </div>
            </td>
           </tr>
          </tbody>
         </table>
        </div>
       </div>
       <!-- [Format Time: 0.1192 seconds] -->
       <p>
       </p>
       <p>
        Accuracy：0.665720
        <br/>
        <img src="http://7xr3b9.com1.z0.glb.clouddn.com/mac_blogs_lr_scaled_2.png"/>
        <br/>
        在添加了category variables后，分类器性能进一步提升：Accuracy由0.620960提升到0.665720，AUC由0.62提升到0.665，说明增加这些特征数据是很有效的。
       </p>
       <h3 id="wiz_toc_18">
        Hyperparameter tuning
       </h3>
       <h4 id="wiz_toc_19">
        Linear Models
       </h4>
       <h5 id="wiz_toc_20">
        Iterations
       </h5>
       <p>
        这里介绍如何选取最优参数，包括对Iterations、step size、regularization params的调优，具体直接看代码吧，很容易的。
       </p>
       <!-- Crayon Syntax Highlighter v_2.7.2_beta -->
       <div class="crayon-syntax crayon-theme-classic crayon-font-monaco crayon-os-pc print-yes notranslate" data-settings=" minimize scroll-mouseover" id="crayon-57685d3677e6e778775216" style=" margin-top: 12px; margin-bottom: 12px; font-size: 12px !important; line-height: 15px !important;">
        <div class="crayon-toolbar" data-settings=" mouseover overlay hide delay" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
         <span class="crayon-title">
         </span>
         <div class="crayon-tools" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
          <div class="crayon-button crayon-nums-button" title="切换是否显示行编号">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-plain-button" title="纯文本显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-wrap-button" title="切换自动换行">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-expand-button" title="点击展开代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-copy-button" title="复制代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-popup-button" title="在新窗口中显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <span class="crayon-language">
           Python
          </span>
         </div>
        </div>
        <div class="crayon-info" style="min-height: 16.8px !important; line-height: 16.8px !important;">
        </div>
        <div class="crayon-plain-wrap">
         <textarea class="crayon-plain print-no" data-settings="dblclick" readonly="" style="-moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4; font-size: 12px !important; line-height: 15px !important;" wrap="soft">
          def train_with_params(input, reg_param, num_iter, step_size):
    lr_model = LogisticRegressionWithSGD.train(input,iterations=num_iter, regParam=reg_param, step=step_size)
    return lr_model
def create_metrics(tag, data, model):
    """Evaluate `model` on `data` and report its area under the ROC curve.

    tag   -- label describing the experiment; returned unchanged
    data  -- collection exposing .map whose elements have .features and .label
             (presumably an RDD of LabeledPoint; confirm against the caller)
    model -- object exposing predict(features)

    Returns the pair (tag, areaUnderROC).
    """
    def to_score_and_label(point):
        # Multiplying by 1.0 makes sure both members of the pair are floats.
        # NOTE(review): if the model keeps its default threshold, predict
        # presumably yields hard 0/1 classes rather than raw scores, so the
        # AUC is computed from class predictions; confirm in the pyspark docs.
        return (1.0 * model.predict(point.features), 1.0 * point.label)

    # The mapped data is passed to BinaryClassificationMetrics as is; there is
    # no sc.parallelize step here.
    roc_area = BinaryClassificationMetrics(data.map(to_score_and_label)).areaUnderROC
    return tag, roc_area

# Each sweep varies one hyperparameter, retrains on scaled_category_data and
# prints the AUC measured on that same data.
# Argument order of train_with_params: (input, reg_param, num_iter, step_size).
# print is written with a single parenthesized argument so the lines behave the
# same under Python 2 and Python 3.

# Sweep 1: number of iterations (no regularization, step size 1.0).
for i in [1, 5, 10, 50]:
    model = train_with_params(scaled_category_data, 0.0, i, 1.0)
    label, roc = create_metrics('%d iterations' % i, scaled_category_data, model)
    print('%s,AUC = %2.2f%%' % (label, roc * 100))

# Sweep 2: step size (no regularization, 10 iterations).
for s in [0.001, 0.01, 0.1, 1.0, 10.0]:
    model = train_with_params(scaled_category_data, 0.0, 10, s)
    label, roc = create_metrics('%f step size' % s, scaled_category_data, model)
    print('%s,AUC = %2.2f%%' % (label, roc * 100))

# Sweep 3: regularization parameter (10 iterations, step size 1.0).
# r must go in the reg_param slot. The earlier call passed (0.0, 1.0, r), which
# trained with no regularization, an iteration count of 1.0 and r as the step
# size, so the printed "regularization parameter" results were really a second
# step-size sweep.
for r in [0.001, 0.01, 0.1, 1.0, 10.0]:
    model = train_with_params(scaled_category_data, r, 10, 1.0)
    label, roc = create_metrics('%f regularization parameter' % r, scaled_category_data, model)
    print('%s,AUC = %2.2f%%' % (label, roc * 100))
         </textarea>
        </div>
        <div class="crayon-main" style="">
         <table class="crayon-table">
          <tbody>
           <tr class="crayon-row">
            <td class="crayon-nums " data-settings="show">
             <div class="crayon-nums-content" style="font-size: 12px !important; line-height: 15px !important;">
              <div class="crayon-num" data-line="crayon-57685d3677e6e778775216-1">
               1
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e6e778775216-2">
               2
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e6e778775216-3">
               3
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e6e778775216-4">
               4
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e6e778775216-5">
               5
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e6e778775216-6">
               6
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e6e778775216-7">
               7
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e6e778775216-8">
               8
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e6e778775216-9">
               9
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e6e778775216-10">
               10
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e6e778775216-11">
               11
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e6e778775216-12">
               12
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e6e778775216-13">
               13
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e6e778775216-14">
               14
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e6e778775216-15">
               15
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e6e778775216-16">
               16
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e6e778775216-17">
               17
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e6e778775216-18">
               18
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e6e778775216-19">
               19
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e6e778775216-20">
               20
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e6e778775216-21">
               21
              </div>
             </div>
            </td>
            <td class="crayon-code">
             <div class="crayon-pre" style="font-size: 12px !important; line-height: 15px !important; -moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4;">
              <div class="crayon-line" id="crayon-57685d3677e6e778775216-1">
               <span class="crayon-r">
                def
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                train_with_params
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-k ">
                input
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                reg_param
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                num_iter
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                step_size
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-o">
                :
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e6e778775216-2">
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                lr_model
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                LogisticRegressionWithSGD
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                train
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-k ">
                input
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                iterations
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-v">
                num_iter
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                regParam
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-v">
                reg_param
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                step
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-v">
                step_size
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e6e778775216-3">
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                return
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                lr_model
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e6e778775216-4">
               <span class="crayon-r">
                def
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                create_metrics
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                tag
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                data
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                model
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-o">
                :
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e6e778775216-5">
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                score_labels
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                data
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                model
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                predict
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                features
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-o">
                *
               </span>
               <span class="crayon-cn">
                1.0
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                label
               </span>
               <span class="crayon-o">
                *
               </span>
               <span class="crayon-cn">
                1.0
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e6e778775216-6">
               <span class="crayon-c">
                #     score_labels_sc = sc.parallelize(score_labels)
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e6e778775216-7">
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                metrics
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                BinaryClassificationMetrics
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                score_labels
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e6e778775216-8">
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                return
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                tag
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                metrics
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                areaUnderROC
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e6e778775216-9">
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e6e778775216-10">
               <span class="crayon-st">
                for
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                i
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                in
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-cn">
                1
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-cn">
                5
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-cn">
                10
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-cn">
                50
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-o">
                :
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e6e778775216-11">
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                model
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                train_with_params
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                scaled_category_data
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                0.0
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                i
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                1.0
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e6e778775216-12">
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                label
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                roc
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                create_metrics
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-s">
                '%d iterations'
               </span>
               <span class="crayon-o">
                %
               </span>
               <span class="crayon-v">
                i
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                scaled_category_data
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                model
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e6e778775216-13">
               <span class="crayon-h">
               </span>
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                '%s,AUC = %2.2f%%'
               </span>
               <span class="crayon-o">
                %
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                label
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                roc
               </span>
               <span class="crayon-o">
                *
               </span>
               <span class="crayon-cn">
                100
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e6e778775216-14">
               <span class="crayon-st">
                for
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                s
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                in
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-cn">
                0.001
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                0.01
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                0.1
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                1.0
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                10.0
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-o">
                :
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e6e778775216-15">
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                model
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                train_with_params
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                scaled_category_data
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                0.0
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                10
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                s
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e6e778775216-16">
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                label
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                roc
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                create_metrics
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-s">
                '%f step size'
               </span>
               <span class="crayon-o">
                %
               </span>
               <span class="crayon-v">
                s
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                scaled_category_data
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                model
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e6e778775216-17">
               <span class="crayon-h">
               </span>
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                '%s,AUC = %2.2f%%'
               </span>
               <span class="crayon-o">
                %
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                label
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                roc
               </span>
               <span class="crayon-o">
                *
               </span>
               <span class="crayon-cn">
                100
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e6e778775216-18">
               <span class="crayon-st">
                for
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                r
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                in
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-cn">
                0.001
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                0.01
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                0.1
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                1.0
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                10.0
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-o">
                :
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e6e778775216-19">
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                model
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                train_with_params
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                scaled_category_data
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                0.0
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                1.0
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                r
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e6e778775216-20">
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                label
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                roc
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                create_metrics
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-s">
                '%f regularization parameter'
               </span>
               <span class="crayon-o">
                %
               </span>
               <span class="crayon-v">
                r
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                scaled_category_data
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                model
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e6e778775216-21">
               <span class="crayon-h">
               </span>
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                '%s,AUC = %2.2f%%'
               </span>
               <span class="crayon-o">
                %
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                label
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                roc
               </span>
               <span class="crayon-o">
                *
               </span>
               <span class="crayon-cn">
                100
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
             </div>
            </td>
           </tr>
          </tbody>
         </table>
        </div>
       </div>
       <!-- [Format Time: 0.0408 seconds] -->
       <p>
       </p>
       <p>
        <img alt="不同迭代次数（iterations）下的 AUC 输出结果" src="http://7xr3b9.com1.z0.glb.clouddn.com/mac_blogs_iterations.png"/>
        <br/>
        <img alt="不同步长（step size）下的 AUC 输出结果" src="http://7xr3b9.com1.z0.glb.clouddn.com/mac_blogs_step_size.png"/>
        <br/>
        <img alt="不同正则化参数（regularization parameter）下的 AUC 输出结果" src="http://7xr3b9.com1.z0.glb.clouddn.com/mac_blogs_relu_params.png"/>
       </p>
       <h4 id="wiz_toc_21">
        Decision trees
       </h4>
       <h5 id="wiz_toc_22">
        Depth and impurity
       </h5>
       <p>
        接下来看决策树：我们考察 maxTreeDepth（树的最大深度）和 impurity（不纯度度量）这两个参数对决策树最终性能的影响：
       </p>
       <!-- Crayon Syntax Highlighter v_2.7.2_beta -->
       <div class="crayon-syntax crayon-theme-classic crayon-font-monaco crayon-os-pc print-yes notranslate" data-settings=" minimize scroll-mouseover" id="crayon-57685d3677e76318494367" style=" margin-top: 12px; margin-bottom: 12px; font-size: 12px !important; line-height: 15px !important;">
        <div class="crayon-toolbar" data-settings=" mouseover overlay hide delay" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
         <span class="crayon-title">
         </span>
         <div class="crayon-tools" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
          <div class="crayon-button crayon-nums-button" title="切换是否显示行编号">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-plain-button" title="纯文本显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-wrap-button" title="切换自动换行">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-expand-button" title="点击展开代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-copy-button" title="复制代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-popup-button" title="在新窗口中显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <span class="crayon-language">
           Python
          </span>
         </div>
        </div>
        <div class="crayon-info" style="min-height: 16.8px !important; line-height: 16.8px !important;">
        </div>
        <div class="crayon-plain-wrap">
         <textarea class="crayon-plain print-no" data-settings="dblclick" readonly="" style="-moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4; font-size: 12px !important; line-height: 15px !important;" wrap="soft">
def train_with_params_dt(input, impurity, maxTreeDepth):
    dt_model = DecisionTree.trainClassifier(input,numClass,{},impurity, maxDepth=maxTreeDepth)
    return dt_model
def create_metrics_dt(tag, data, model):
    predictList= model.predict(data.map(lambda lp: lp.features)).collect()
    trueLabel = data.map(lambda lp: lp.label).collect()
    scoresAndLabels = [(predictList[i],true_val) for i, true_val in enumerate(trueLabel)]
    scoresAndLabels_sc = sc.parallelize(scoresAndLabels)
    scoresAndLabels_sc = scoresAndLabels_sc.map(lambda (x,y): (float(x),float(y)))
    dt_metrics = BinaryClassificationMetrics(scoresAndLabels_sc)
    return tag,dt_metrics.areaUnderROC
for dep in [1,2,3,4,5,10,20]:
    for im in ['entropy','gini']:
        model=train_with_params_dt(data,im,dep)
        tag, roc = create_metrics_dt('impurity: %s, %d maxTreeDepth:'%(im,dep),data,model)
        print '%s, AUC = %2.2f'%(tag,roc*100)
         </textarea>
        </div>
        <div class="crayon-main" style="">
         <table class="crayon-table">
          <tbody>
           <tr class="crayon-row">
            <td class="crayon-nums " data-settings="show">
             <div class="crayon-nums-content" style="font-size: 12px !important; line-height: 15px !important;">
              <div class="crayon-num" data-line="crayon-57685d3677e76318494367-1">
               1
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e76318494367-2">
               2
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e76318494367-3">
               3
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e76318494367-4">
               4
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e76318494367-5">
               5
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e76318494367-6">
               6
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e76318494367-7">
               7
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e76318494367-8">
               8
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e76318494367-9">
               9
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e76318494367-10">
               10
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e76318494367-11">
               11
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e76318494367-12">
               12
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e76318494367-13">
               13
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e76318494367-14">
               14
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e76318494367-15">
               15
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e76318494367-16">
               16
              </div>
             </div>
            </td>
            <td class="crayon-code">
             <div class="crayon-pre" style="font-size: 12px !important; line-height: 15px !important; -moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4;">
              <div class="crayon-line" id="crayon-57685d3677e76318494367-1">
               <span class="crayon-e">
                def
               </span>
               <span class="crayon-e">
                train_with_params_dt
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-i">
                input
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                impurity
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                maxTreeDepth
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-o">
                :
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e76318494367-2">
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                dt_model
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                DecisionTree
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                trainClassifier
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-i">
                input
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-i">
                numClass
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-sy">
                {
               </span>
               <span class="crayon-sy">
                }
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-i">
                impurity
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                maxDepth
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-i">
                maxTreeDepth
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e76318494367-3">
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                return
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                dt_model
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e76318494367-4">
               <span class="crayon-e">
                def
               </span>
               <span class="crayon-e">
                create_metrics_dt
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-i">
                tag
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                data
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                model
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-o">
                :
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e76318494367-5">
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                predictList
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                model
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                predict
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-i">
                data
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-e">
                lambda
               </span>
               <span class="crayon-r ">
                lp
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-r ">
                lp
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-i">
                features
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                collect
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e76318494367-6">
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                trueLabel
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                data
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-e">
                lambda
               </span>
               <span class="crayon-r ">
                lp
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-r ">
                lp
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-i">
                label
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                collect
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e76318494367-7">
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                scoresAndLabels
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-i">
                predictList
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-i">
                i
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-i">
                true_val
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                for
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                i
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                true_val
               </span>
               <span class="crayon-st">
                in
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                enumerate
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-i">
                trueLabel
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                ]
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e76318494367-8">
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                scoresAndLabels_sc
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-r ">
                sc
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                parallelize
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-i">
                scoresAndLabels
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e76318494367-9">
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                scoresAndLabels_sc
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                scoresAndLabels_sc
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-e">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-i">
                x
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-i">
                y
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-t">
                float
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-i">
                x
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-t">
                float
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-i">
                y
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e76318494367-10">
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                dt_metrics
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                BinaryClassificationMetrics
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-i">
                scoresAndLabels_sc
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e76318494367-11">
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                return
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                tag
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-i">
                dt_metrics
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                areaUnderROC
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e76318494367-12">
               <span class="crayon-st">
                for
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                dep
               </span>
               <span class="crayon-st">
                in
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                [
               </span>
               1
               <span class="crayon-sy">
                ,
               </span>
               2
               <span class="crayon-sy">
                ,
               </span>
               3
               <span class="crayon-sy">
                ,
               </span>
               4
               <span class="crayon-sy">
                ,
               </span>
               5
               <span class="crayon-sy">
                ,
               </span>
               10
               <span class="crayon-sy">
                ,
               </span>
               20
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-o">
                :
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e76318494367-13">
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                for
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                im
               </span>
               <span class="crayon-st">
                in
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-s">
                'entropy'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-s">
                'gini'
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-o">
                :
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e76318494367-14">
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                model
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-e">
                train_with_params_dt
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-i">
                data
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-i">
                im
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-i">
                dep
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e76318494367-15">
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                tag
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                roc
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                create_metrics_dt
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-s">
                'impurity: %s, %d maxTreeDepth:'
               </span>
               <span class="crayon-o">
                %
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-i">
                im
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-i">
                dep
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-i">
                data
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-i">
                model
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e76318494367-16">
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                '%s, AUC = %2.2f'
               </span>
               <span class="crayon-o">
                %
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-i">
                tag
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-i">
                roc
               </span>
               <span class="crayon-o">
                *
               </span>
               100
               <span class="crayon-sy">
                )
               </span>
              </div>
             </div>
            </td>
           </tr>
          </tbody>
         </table>
        </div>
       </div>
       <!-- [Format Time: 0.0284 seconds] -->
       <p>
       </p>
       <p>
        最终结果：
        <br/>
        <img alt="决策树模型在不同 impurity 与 maxTreeDepth 组合下的 AUC 输出结果" src="http://7xr3b9.com1.z0.glb.clouddn.com/mac_blogs_dt_results.png"/>
       </p>
       <h4 id="wiz_toc_23">
        Naive Bayes Model
       </h4>
       <p>
        朴素贝叶斯模型比较简单，直接上代码吧：
       </p>
       <!-- Crayon Syntax Highlighter v_2.7.2_beta -->
       <div class="crayon-syntax crayon-theme-classic crayon-font-monaco crayon-os-pc print-yes notranslate" data-settings=" minimize scroll-mouseover" id="crayon-57685d3677e7d072937641" style=" margin-top: 12px; margin-bottom: 12px; font-size: 12px !important; line-height: 15px !important;">
        <div class="crayon-toolbar" data-settings=" mouseover overlay hide delay" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
         <span class="crayon-title">
         </span>
         <div class="crayon-tools" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
          <div class="crayon-button crayon-nums-button" title="切换是否显示行编号">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-plain-button" title="纯文本显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-wrap-button" title="切换自动换行">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-expand-button" title="点击展开代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-copy-button" title="复制代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-popup-button" title="在新窗口中显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <span class="crayon-language">
           Python
          </span>
         </div>
        </div>
        <div class="crayon-info" style="min-height: 16.8px !important; line-height: 16.8px !important;">
        </div>
        <div class="crayon-plain-wrap">
         <textarea class="crayon-plain print-no" data-settings="dblclick" readonly="" style="-moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4; font-size: 12px !important; line-height: 15px !important;" wrap="soft">
          def train_with_params_nb(input, lambda1):
    nb_model = NaiveBayes.train(input,lambda1)
    return nb_model
def create_metrics_nb(tag, nbdata, model):
    scoresAndLabels = nbdata.map(lambda point:(float(model.predict(point.features)),point.label))
    nb_metrics = BinaryClassificationMetrics(scoresAndLabels)
    return tag,nb_metrics.areaUnderROC
for la in [0.001, 0.01, 0.1, 1.0, 10.0]:
    model=train_with_params_nb(nbdata,la)
    tag, roc = create_metrics_nb('%f lambda' %la,nbdata,model)
    print '%s, AUC = %2.2f'%(tag,roc*100)
         </textarea>
        </div>
        <div class="crayon-main" style="">
         <table class="crayon-table">
          <tbody>
           <tr class="crayon-row">
            <td class="crayon-nums " data-settings="show">
             <div class="crayon-nums-content" style="font-size: 12px !important; line-height: 15px !important;">
              <div class="crayon-num" data-line="crayon-57685d3677e7d072937641-1">
               1
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e7d072937641-2">
               2
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e7d072937641-3">
               3
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e7d072937641-4">
               4
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e7d072937641-5">
               5
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e7d072937641-6">
               6
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e7d072937641-7">
               7
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e7d072937641-8">
               8
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e7d072937641-9">
               9
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e7d072937641-10">
               10
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e7d072937641-11">
               11
              </div>
             </div>
            </td>
            <td class="crayon-code">
             <div class="crayon-pre" style="font-size: 12px !important; line-height: 15px !important; -moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4;">
              <div class="crayon-line" id="crayon-57685d3677e7d072937641-1">
               <span class="crayon-r">
                def
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                train_with_params_nb
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-k ">
                input
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                lambda1
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-o">
                :
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e7d072937641-2">
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                nb_model
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                NaiveBayes
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                train
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-k ">
                input
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                lambda1
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e7d072937641-3">
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                return
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                nb_model
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e7d072937641-4">
               <span class="crayon-r">
                def
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                create_metrics_nb
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                tag
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                nbdata
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                model
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-o">
                :
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e7d072937641-5">
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                scoresAndLabels
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                nbdata
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                point
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-k ">
                float
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                model
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                predict
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                point
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                features
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                point
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                label
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e7d072937641-6">
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                nb_metrics
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                BinaryClassificationMetrics
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                scoresAndLabels
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e7d072937641-7">
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                return
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                tag
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                nb_metrics
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                areaUnderROC
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e7d072937641-8">
               <span class="crayon-st">
                for
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                la
               </span>
               <span class="crayon-st">
                in
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-cn">
                0.001
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                0.01
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                0.1
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                1.0
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                10.0
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-o">
                :
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e7d072937641-9">
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                model
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-e">
                train_with_params_nb
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                nbdata
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                la
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e7d072937641-10">
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                tag
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                roc
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                create_metrics_nb
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-s">
                '%f lambda'
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                %
               </span>
               <span class="crayon-v">
                la
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                nbdata
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                model
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e7d072937641-11">
               <span class="crayon-h">
               </span>
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                '%s, AUC = %2.2f'
               </span>
               <span class="crayon-o">
                %
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                tag
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                roc
               </span>
               <span class="crayon-o">
                *
               </span>
               <span class="crayon-cn">
                100
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
             </div>
            </td>
           </tr>
          </tbody>
         </table>
        </div>
       </div>
       <!-- [Format Time: 0.0229 seconds] -->
       <p>
       </p>
       <p>
        结果：
        <br/>
        <img src="http://7xr3b9.com1.z0.glb.clouddn.com/mac_blogs_nb_results.png"/>
       </p>
       <h3 id="wiz_toc_24">
        Cross-Validation
       </h3>
       <p>
        交叉验证，通常是离线评判的一种手段，尤其是对于小数据集，一般我们会对数据集进行按比率划分为训练集和测试集，然后用测试集下的数据来验证我们在训练集下的模型，这样会使模型更具有泛化能力，不多说，上代码：
       </p>
       <!-- Crayon Syntax Highlighter v_2.7.2_beta -->
       <div class="crayon-syntax crayon-theme-classic crayon-font-monaco crayon-os-pc print-yes notranslate" data-settings=" minimize scroll-mouseover" id="crayon-57685d3677e84493373162" style=" margin-top: 12px; margin-bottom: 12px; font-size: 12px !important; line-height: 15px !important;">
        <div class="crayon-toolbar" data-settings=" mouseover overlay hide delay" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
         <span class="crayon-title">
         </span>
         <div class="crayon-tools" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
          <div class="crayon-button crayon-nums-button" title="切换是否显示行编号">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-plain-button" title="纯文本显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-wrap-button" title="切换自动换行">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-expand-button" title="点击展开代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-copy-button" title="复制代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-popup-button" title="在新窗口中显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <span class="crayon-language">
           Python
          </span>
         </div>
        </div>
        <div class="crayon-info" style="min-height: 16.8px !important; line-height: 16.8px !important;">
        </div>
        <div class="crayon-plain-wrap">
         <textarea class="crayon-plain print-no" data-settings="dblclick" readonly="" style="-moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4; font-size: 12px !important; line-height: 15px !important;" wrap="soft">
train_test_split = scaled_category_data.randomSplit([0.6,0.4],123)
train = train_test_split[0]
test = train_test_split[1]
for r in [0.0, 0.001, 0.0025, 0.005, 0.01]:
    model = train_with_params(train, 0.0, 1.0, r)
    label, roc = create_metrics('%f regularization parameter'%r,test,model)
    print '%s,AUC = %2.2f%%'%(label,roc*100)
         </textarea>
        </div>
        <div class="crayon-main" style="">
         <table class="crayon-table">
          <tbody>
           <tr class="crayon-row">
            <td class="crayon-nums " data-settings="show">
             <div class="crayon-nums-content" style="font-size: 12px !important; line-height: 15px !important;">
              <div class="crayon-num" data-line="crayon-57685d3677e84493373162-1">
               1
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e84493373162-2">
               2
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e84493373162-3">
               3
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e84493373162-4">
               4
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e84493373162-5">
               5
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57685d3677e84493373162-6">
               6
              </div>
              <div class="crayon-num" data-line="crayon-57685d3677e84493373162-7">
               7
              </div>
             </div>
            </td>
            <td class="crayon-code">
             <div class="crayon-pre" style="font-size: 12px !important; line-height: 15px !important; -moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4;">
              <div class="crayon-line" id="crayon-57685d3677e84493373162-1">
               <span class="crayon-v">
                train_test_split
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                scaled_category_data
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                randomSplit
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-cn">
                0.6
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-cn">
                0.4
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-cn">
                123
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e84493373162-2">
               <span class="crayon-v">
                train
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                train_test_split
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-cn">
                0
               </span>
               <span class="crayon-sy">
                ]
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e84493373162-3">
               <span class="crayon-k ">
                test
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                train_test_split
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-cn">
                1
               </span>
               <span class="crayon-sy">
                ]
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e84493373162-4">
               <span class="crayon-st">
                for
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                r
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                in
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-cn">
                0.0
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                0.001
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                0.0025
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                0.005
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                0.01
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-o">
                :
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e84493373162-5">
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                model
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                train_with_params
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                train
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                0.0
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                1.0
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                r
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57685d3677e84493373162-6">
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                label
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                roc
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                create_metrics
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-s">
                '%f regularization parameter'
               </span>
               <span class="crayon-o">
                %
               </span>
               <span class="crayon-v">
                r
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-k ">
                test
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                model
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57685d3677e84493373162-7">
               <span class="crayon-h">
               </span>
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                '%s,AUC = %2.2f%%'
               </span>
               <span class="crayon-o">
                %
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                label
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                roc
               </span>
               <span class="crayon-o">
                *
               </span>
               <span class="crayon-cn">
                100
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
             </div>
            </td>
           </tr>
          </tbody>
         </table>
        </div>
       </div>
       <!-- [Format Time: 0.0185 seconds] -->
       <p>
       </p>
       <div>
        <a href="http://www.wiz.cn/i/c1f0ca05" title="来自为知笔记(Wiz)">
         来自为知笔记(Wiz)
        </a>
       </div>
      </div>
      <div>
       <strong>
        注：转载文章均来自于公开网络，仅供学习使用，不会用于任何商业用途，如果侵犯到原作者的权益，请您与我们联系删除或者授权事宜，联系邮箱：contact@dataunion.org。转载数盟网站文章请注明原文章作者，否则产生的任何版权纠纷与数盟无关。
       </strong>
      </div>
      <!--content_text-->
      <div class="fenxian">
       <!-- JiaThis Button BEGIN -->
       <div class="jiathis_style_32x32">
        <p class="jiathis_button_weixin">
        </p>
        <p class="jiathis_button_tsina">
        </p>
        <p class="jiathis_button_qzone">
        </p>
        <p class="jiathis_button_cqq">
        </p>
        <p class="jiathis_button_tumblr">
        </p>
        <a class="jiathis jiathis_txt jtico jtico_jiathis" href="http://www.jiathis.com/share" target="_blank">
        </a>
        <p class="jiathis_counter_style">
        </p>
       </div>
       <!-- JiaThis Button END -->
      </div>
     </article>
     <!--content-->
     <!--相关文章-->
     <div class="xianguan">
      <div class="xianguantitle">
       相关文章！
      </div>
      <ul class="pic">
       <li>
        <a href="http://dataunion.org/24678.html">
         <img src="http://dataunion.org/wp-content/uploads/2016/06/20140917125452915416-216x200.jpg"/>
        </a>
        <a class="link" href="http://dataunion.org/24678.html" rel="bookmark" title="python3中的正则模块">
         python3中的正则模块
        </a>
       </li>
       <li>
        <a href="http://dataunion.org/24675.html">
         <img src="http://dataunion.org/wp-content/uploads/2016/06/t015b337bd75d9ef893-161x200.jpg"/>
        </a>
        <a class="link" href="http://dataunion.org/24675.html" rel="bookmark" title="注释是恶魔，请不要再写一行注释">
         注释是恶魔，请不要再写一行注释
        </a>
       </li>
       <li>
        <a href="http://dataunion.org/24660.html">
         <img src="http://dataunion.org/wp-content/uploads/2016/06/u16130037972892789947fm21gp0-300x157.jpg"/>
        </a>
        <a class="link" href="http://dataunion.org/24660.html" rel="bookmark" title="如何从Github上轻松安装R包">
         如何从Github上轻松安装R包
        </a>
       </li>
       <li>
        <a href="http://dataunion.org/24654.html">
         <img src="http://dataunion.org/wp-content/uploads/2016/06/df53fac99fc53ba5a90666abcca25e6d_b-267x200.png"/>
        </a>
        <a class="link" href="http://dataunion.org/24654.html" rel="bookmark" title="简单形象又有趣地说说强大的神经网络">
         简单形象又有趣地说说强大的神经网络
        </a>
       </li>
      </ul>
     </div>
     <!--相关文章-->
     <div class="comment" id="comments">
      <!-- You can start editing here. -->
      <!-- If comments are open, but there are no comments. -->
      <div class="title">
       期待你一针见血的评论，Come on！
      </div>
      <div id="respond">
       <p>
        不用想啦，马上
        <a href="http://dataunion.org/wp-login.php?redirect_to=http%3A%2F%2Fdataunion.org%2F22574.html">
         "登录"
        </a>
        发表自己的想法.
       </p>
      </div>
     </div>
     <!-- .nav-single -->
    </div>
    <!--Container End-->
    <aside id="sitebar">
     <div class="sitebar_list2">
      <div class="wptag">
       <span class="tagtitle">
        热门标签+
       </span>
       <div class="tagg">
        <ul class="menu" id="menu-%e5%8f%8b%e6%83%85%e9%93%be%e6%8e%a5">
         <li class="menu-item menu-item-type-custom menu-item-object-custom menu-item-1605" id="menu-item-1605">
          <a href="http://taidizh.com/">
           泰迪智慧
          </a>
         </li>
         <li class="menu-item menu-item-type-custom menu-item-object-custom menu-item-20884" id="menu-item-20884">
          <a href="http://www.transwarp.cn/">
           星环科技
          </a>
         </li>
         <li class="menu-item menu-item-type-custom menu-item-object-custom menu-item-3538" id="menu-item-3538">
          <a href="http://datall.org/">
           珈和遥感
          </a>
         </li>
         <li class="menu-item menu-item-type-custom menu-item-object-custom menu-item-20888" id="menu-item-20888">
          <a href="http://www.chinahadoop.cn/">
           小象学院
          </a>
         </li>
        </ul>
       </div>
      </div>
     </div>
     <div class="sitebar_list">
      <div class="textwidget">
       <div align="center">
        <a href="http://study.163.com/course/courseMain.htm?courseId=991022" target="_blank">
         <img src="http://dataunion.org/wp-content/uploads/2016/03/dv.jpg"/>
        </a>
       </div>
      </div>
     </div>
     <div class="sitebar_list">
      <h4 class="sitebar_title">
       文章分类
      </h4>
      <div class="tagcloud">
       <a class="tag-link-44" href="http://dataunion.org/category/industry/demo" style="font-size: 10.204724409449pt;" title="4个话题">
        Demo展示
       </a>
       <a class="tag-link-31" href="http://dataunion.org/category/experts" style="font-size: 15.826771653543pt;" title="52个话题">
        专家团队
       </a>
       <a class="tag-link-870" href="http://dataunion.org/category/tech/ai" style="font-size: 19.795275590551pt;" title="273个话题">
        人工智能
       </a>
       <a class="tag-link-488" href="http://dataunion.org/category/%e5%8a%a0%e5%85%a5%e6%95%b0%e7%9b%9f" style="font-size: 8pt;" title="1个话题">
        加入数盟
       </a>
       <a class="tag-link-869" href="http://dataunion.org/category/tech/viz" style="font-size: 17.204724409449pt;" title="93个话题">
        可视化
       </a>
       <a class="tag-link-30" href="http://dataunion.org/category/partners" style="font-size: 10.645669291339pt;" title="5个话题">
        合作伙伴
       </a>
       <a class="tag-link-889" href="http://dataunion.org/category/parterc" style="font-size: 11.582677165354pt;" title="8个话题">
        合作会议
       </a>
       <a class="tag-link-104" href="http://dataunion.org/category/books" style="font-size: 12.96062992126pt;" title="15个话题">
        图书
       </a>
       <a class="tag-link-220" href="http://dataunion.org/category/tech/base" style="font-size: 19.850393700787pt;" title="281个话题">
        基础架构
       </a>
       <a class="tag-link-219" href="http://dataunion.org/category/tech/analysis" style="font-size: 19.409448818898pt;" title="232个话题">
        数据分析
       </a>
       <a class="tag-link-887" href="http://dataunion.org/category/tech/dm" style="font-size: 13.291338582677pt;" title="17个话题">
        数据挖掘
       </a>
       <a class="tag-link-34" href="http://dataunion.org/category/tech" style="font-size: 20.732283464567pt;" title="404个话题">
        文章
       </a>
       <a class="tag-link-1" href="http://dataunion.org/category/uncategorized" style="font-size: 22pt;" title="693个话题">
        未分类
       </a>
       <a class="tag-link-4" href="http://dataunion.org/category/events" style="font-size: 14.503937007874pt;" title="29个话题">
        活动
       </a>
       <a class="tag-link-890" href="http://dataunion.org/category/tech/%e6%b7%b1%e5%ba%a6%e5%ad%a6%e4%b9%a0" style="font-size: 10.204724409449pt;" title="4个话题">
        深度学习
       </a>
       <a class="tag-link-221" href="http://dataunion.org/category/tech/devl" style="font-size: 18.968503937008pt;" title="193个话题">
        编程语言
       </a>
       <a class="tag-link-888" href="http://dataunion.org/category/career" style="font-size: 15.661417322835pt;" title="48个话题">
        职业规划
       </a>
       <a class="tag-link-5" href="http://dataunion.org/category/jobs" style="font-size: 14.11811023622pt;" title="25个话题">
        职位
       </a>
       <a class="tag-link-871" href="http://dataunion.org/category/industry" style="font-size: 15.716535433071pt;" title="49个话题">
        行业
       </a>
       <a class="tag-link-613" href="http://dataunion.org/category/industry/case" style="font-size: 16.984251968504pt;" title="84个话题">
        行业应用
       </a>
       <a class="tag-link-885" href="http://dataunion.org/category/industry/news" style="font-size: 17.425196850394pt;" title="102个话题">
        行业资讯
       </a>
       <a class="tag-link-10" href="http://dataunion.org/category/training" style="font-size: 14.228346456693pt;" title="26个话题">
        课程
       </a>
       <a class="tag-link-16" href="http://dataunion.org/category/sources" style="font-size: 15.661417322835pt;" title="48个话题">
        资源
       </a>
      </div>
     </div>
     <div class="sitebar_list">
      <h4 class="sitebar_title">
       功能
      </h4>
      <ul>
       <li>
        <a href="http://dataunion.org/wp-login.php?action=register">
         注册
        </a>
       </li>
       <li>
        <a href="http://dataunion.org/wp-login.php">
         登录
        </a>
       </li>
       <li>
        <a href="http://dataunion.org/feed">
         文章
         <abbr title="Really Simple Syndication">
          RSS
         </abbr>
        </a>
       </li>
       <li>
        <a href="http://dataunion.org/comments/feed">
         评论
         <abbr title="Really Simple Syndication">
          RSS
         </abbr>
        </a>
       </li>
       <li>
        <a href="https://cn.wordpress.org/" title="基于WordPress，一个优美、先进的个人信息发布平台。">
         WordPress.org
        </a>
       </li>
      </ul>
     </div>
    </aside>
    <div class="clear">
    </div>
   </div>
   <!--main-->
   <footer id="dibu">
    <div class="about">
     <div class="right">
      <ul class="menu" id="menu-%e5%ba%95%e9%83%a8%e8%8f%9c%e5%8d%95">
       <li class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-18024" id="menu-item-18024">
        <a href="http://dataunion.org/category/partners">
         合作伙伴
        </a>
       </li>
       <li class="menu-item menu-item-type-post_type menu-item-object-page menu-item-20881" id="menu-item-20881">
        <a href="http://dataunion.org/contribute">
         文章投稿
        </a>
       </li>
       <li class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-20872" id="menu-item-20872">
        <a href="http://dataunion.org/category/%e5%8a%a0%e5%85%a5%e6%95%b0%e7%9b%9f">
         加入数盟
        </a>
       </li>
       <li class="menu-item menu-item-type-post_type menu-item-object-page menu-item-22441" id="menu-item-22441">
        <a href="http://dataunion.org/f-links">
         友情链接
        </a>
       </li>
       <li class="menu-item menu-item-type-post_type menu-item-object-page menu-item-20874" id="menu-item-20874">
        <a href="http://dataunion.org/aboutus">
         关于数盟
        </a>
       </li>
      </ul>
      <p class="banquan">
       数盟社区        ，
        做最棒的数据科学社区
      </p>
     </div>
     <div class="left">
      <ul class="bottomlist">
       <li>
        <a href="http://weibo.com/DataScientistUnion" target="_blank" title="">
         <img src="http://dataunion.org/wp-content/themes/yzipi/images/weibo.png"/>
        </a>
       </li>
       <li>
        <a class="cd-popup-trigger" href="http://dataunion.org/22574.html#0">
         <img src="http://dataunion.org/wp-content/themes/yzipi/images/weixin.png"/>
        </a>
       </li>
      </ul>
      <div class="cd-popup">
       <div class="cd-popup-container">
        <h1>
         扫描二维码,加微信公众号
        </h1>
        <img src="http://dataunion.org/wp-content/themes/yzipi/images/2014-12-06-1515289049.png"/>
        <a class="cd-popup-close" href="http://dataunion.org/22574.html">
        </a>
       </div>
       <!-- cd-popup-container -->
      </div>
      <!-- cd-popup -->
     </div>
    </div>
    <!--about-->
    <div class="bottom">
     <a href="http://dataunion.org/">
      数盟社区
     </a>
     <a href="http://www.miitbeian.gov.cn/" rel="external nofollow" target="_blank">
      京ICP备14026740号
     </a>
     联系我们：
     <a href="mailto:contact@dataunion.org" target="_blank">
      contact@dataunion.org
     </a>
     <div class="tongji">
     </div>
     <!--bottom-->
     <div class="scroll" id="scroll" style="display:none;">
      ︿
     </div>
    </div>
   </footer>
   <!--dibu-->
  </div>
 </body>
</html>